DAG

library(DiagrammeR) 
# Nodes
 #node [shape = box]
 # S [label = 'Matched\n(S=1)',fontsize=7]
 # C [label = 'Not censored\n(C=0)',fontsize=7]
gr1<-
DiagrammeR::grViz("
digraph causal {

# Nodes
  node [shape = plaintext]
  a [label = 'Observed\nConfounders\n(Z)',fontsize=10]
  b [label = 'Unobserved\nConfounders\n(U)',fontsize=10]
  c [label = 'Early\nDrop-out\n(Y)',fontsize=10]
  d [label = 'Residential\nPrograms\n(X)',fontsize=10]

# Edges
  edge [color = black,
        arrowhead = vee]
  rankdir = TB;
  
  b -> c 
  b -> a 
  a -> c  

  d -> c [minlen=1]
  d -> a [minlen=1]
  
 # a -> S #[minlen=1]
 # Z -> S #[minlen=1]
  
#  a -> C #[minlen=3]
#  Z -> C #[minlen=3]
  { rank = same; b; a; c }
# { rank = same; S; C }
  { rankdir = LR; a; d }

# Graph
  graph [overlap = true]
}")
gr1

Figure 1. Directed Acyclic Graph

#  {rank=same ; A -> B -> C -> D};
#       {rank=same ;           F -> E[dir=back]};
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3733703/
#Cohort matching on a variable associated with both outcome and censoring
#Cohort matching on a confounder. We let A denote an exposure, Y denote an outcome, and C denote a confounder and matching variable. The variable S indicates whether an individual in the source population is selected for the matched study (1: selected, 0: not selected). See Section 2-7 for details.
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7064555/
gr2<-
DiagrammeR::grViz("
digraph causal {

  # Nodes
  node [shape = plaintext]
  a [label = 'Residential\nPrograms\n(X)',fontsize=10]
  b [label = 'Unobserved\nConfounders\n(U)',fontsize=10]
  c [label = 'Early\nDrop-out\n(Y)',fontsize=10]
  d [label = 'Observed\nConfounders\n(Z)',fontsize=10]

  # Edges
  edge [color = black,
        arrowhead = vee]
  rankdir = TB
  a -> c [minlen=3]
  d -> a [minlen=3]
  d -> c [minlen=9]
  
  b -> a [minlen=1]
  b -> c
  
{ rank = same; c; d }
#{ rank = same; b; d }
  rankdir = TB
{ rank = same; d; c } #Ver si lo saco, creo que da problemas
  
  # Graph
  graph [overlap = true]
}")#LR

Balance

We selected treatments at baseline for each user, leaving 85,048 observations. Then, we distinguished between residential 12,706 and ambulatory (72,267) treatments. We imputed cases that did not have a defined treatment assigned 75.


We selected the following variables of interest:

  • “Starting Substance” (sus_ini_mvv)
  • “Marital Status” (estado_conyugal_2)
  • “Educational Attainment” (escolaridad_rec)
  • “Age of Onset of Drug Use” (edad_ini_cons)
  • “Frequency of use of primary drug” (freq_cons_sus_prin)
  • “Motive of Admission to Treatment” (origen_ingreso_mod)
  • “Psychiatric co-morbidity” (dg_cie_10_rec)
  • “Drug Dependence” (dg_trs_cons_sus_or)
  • “Chilean Region of the Center” (nombre_region)
  • “Type of Center (Public)” (tipo_centro_pub)
  • “Sex” (sexo_2)
  • “Age at Admission to Treatment” (edad_al_ing)
  • “Date of Admission to Treatment” (fech_ing_num)
  • “Evaluation of the Therapeutic Process” (*) (evaluacindelprocesoteraputico)
  • “Early Dropout (Against Staff Advice)” (abandono_temprano_rec) (Y)
  • “Residential Type of Plan” (tipo_de_plan_res) (Z)


library(compareGroups)

match.on_tot <- c("row", "hash_key","sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","edad_ini_cons","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","nombre_region","tipo_centro_pub","sexo_2","edad_al_ing","fech_ing_num","abandono_temprano_rec","tipo_de_plan_res","duplicates_filtered","dg_trs_cons_sus_or","evaluacindelprocesoteraputico")
#dg_trs_cons_sus_or

CONS_C1_df_dup_SEP_2020_match<-
  CONS_C1_df_dup_SEP_2020 %>% 
  dplyr::filter(dup==1) %>% #, tipo_de_plan_2 %in% c("PG-PR","M-PR","PG-PAI","M-PAI","PG-PAB","M-PAB")
  dplyr::mutate(tipo_de_plan_res=dplyr::case_when(grepl("PR",as.character(tipo_de_plan_2))~1,
                                                  grepl("PAI",as.character(tipo_de_plan_2))~0,
                                                  grepl("PAB",as.character(tipo_de_plan_2))~0,
                                                  TRUE~NA_real_)) %>% 
  dplyr::mutate(tipo_de_plan_res=factor(tipo_de_plan_res)) %>% 
  dplyr::mutate(abandono_temprano_rec=factor(if_else(as.character(motivodeegreso_mod_imp)=="Early Drop-out",TRUE,FALSE,NA))) %>% 
  dplyr::mutate(dg_trs_cons_sus_or=factor(if_else(as.character(dg_trs_cons_sus_or)=="Drug dependence",TRUE,FALSE,NA))) %>% 
  dplyr::mutate(tipo_centro_pub=factor(if_else(as.character(tipo_centro)=="Public",TRUE,FALSE,NA))) %>% 
  dplyr::mutate(condicion_ocupacional_corr=factor(condicion_ocupacional_corr),cat_ocupacional_corr=factor(cat_ocupacional_corr)) %>% 
  dplyr::mutate(dg_trs_fis_rec=factor(dplyr::case_when(as.character(diagnostico_trs_fisico)=="En estudio"~"Diagnosis unknown (under study)",as.character(diagnostico_trs_fisico)=="Sin trastorno"~'Without physical comorbidity',cnt_diagnostico_trs_fisico>0 ~'With physical comorbidity',
                                             TRUE~NA_character_)))%>%
    dplyr::mutate(escolaridad_rec=parse_factor(as.character(escolaridad_rec),levels=c('3-Completed primary school or less', '2-Completed high school or less', '1-More than high school'), ordered=T,trim_ws=T,include_na =F, locale=locale(encoding = "Latin1"))) %>%   
dplyr::mutate(freq_cons_sus_prin=parse_factor(as.character(freq_cons_sus_prin),levels=c('Did not use', 'Less than 1 day a week','2 to 3 days a week','4 to 6 days a week','1 day a week or more','Daily'), ordered =T,trim_ws=T,include_na =F, locale=locale(encoding = "UTF-8"))) %>% 
  dplyr::mutate(evaluacindelprocesoteraputico=dplyr::case_when(grepl("1",as.character(evaluacindelprocesoteraputico))~'1-High Achievement',grepl("2",as.character(evaluacindelprocesoteraputico))~'2-Medium Achievement',grepl("3",as.character(evaluacindelprocesoteraputico))~'3-Minimum Achievement', TRUE~as.character(evaluacindelprocesoteraputico))) %>% 
  dplyr::mutate(evaluacindelprocesoteraputico=parse_factor(as.character(evaluacindelprocesoteraputico),levels=c('1-High Achievement', '2-Medium Achievement','3-Minimum Achievement'), ordered =T,trim_ws=T,include_na =F, locale=locale(encoding = "UTF-8"))) %>% 
  dplyr::select_(.dots = match.on_tot) %>% 
  dplyr::mutate(more_one_treat=factor(ifelse(duplicates_filtered>1,1,0))) %>% 
  data.table::data.table()
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
#CONS_C1_df_dup_SEP_2020_match %>% 
  #dplyr::group_by(dg_trs_fis) %>% dplyr::summarise(q1=quantile(dias_treat_imp_sin_na,.25),q2=quantile(dias_treat_imp_sin_na,.5),q3=quantile(dias_treat_imp_sin_na,.75)) ---> las distribuciones por días de tratamiento de las categorías de respuesta tienden a ser bastante similares, aunquequienes tienen una comorbiliad física definida tienen más tiempo en el estudio.
invisible("La diferencia en días de tratamiento entre las categorías de enfermedad psiquiátrica, indica que quienes se encuentran en estudio tienen muchos menos días en tratamiento que quienes no tienen una comorbilidad o quienes tienen una definida. No es lo mismo con el caso de la enfermedad física, en donde tienden a ser bastante similares")

invisible("Decidí no incluir diagnóstico de enferemedad física, porque hay algunas condiciones que son crónicas o que pueden serlo, y que no tengo cómo validarlas a lo largo del tratamiento")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

attr(CONS_C1_df_dup_SEP_2020_match$sus_ini_mod_mvv,"label")<-"Starting Substance"
attr(CONS_C1_df_dup_SEP_2020_match$estado_conyugal_2,"label")<-"Marital Status"
attr(CONS_C1_df_dup_SEP_2020_match$escolaridad_rec,"label")<-"Educational Attainment"
attr(CONS_C1_df_dup_SEP_2020_match$edad_ini_cons,"label")<-"Age of Onset of Drug Use"
attr(CONS_C1_df_dup_SEP_2020_match$freq_cons_sus_prin,"label")<-"Frequency of use of primary drug"
attr(CONS_C1_df_dup_SEP_2020_match$origen_ingreso_mod,"label")<-"Motive of Admission to Treatment"
attr(CONS_C1_df_dup_SEP_2020_match$dg_cie_10_rec,"label")<-"Psychiatric co-morbidity"
attr(CONS_C1_df_dup_SEP_2020_match$nombre_region,"label")<-"Chilean Region of the Center"
attr(CONS_C1_df_dup_SEP_2020_match$tipo_centro_pub,"label")<-"Type of Center (Public)"
attr(CONS_C1_df_dup_SEP_2020_match$sexo_2,"label")<-"Sex"
attr(CONS_C1_df_dup_SEP_2020_match$edad_al_ing,"label")<-"Age at Admission"
attr(CONS_C1_df_dup_SEP_2020_match$fech_ing_num,"label")<-"Date of Admission to Treatment"
attr(CONS_C1_df_dup_SEP_2020_match$abandono_temprano_rec,"label")<-"Early Dropout"
attr(CONS_C1_df_dup_SEP_2020_match$tipo_de_plan_res,"label")<-"Residential Type of Plan"
attr(CONS_C1_df_dup_SEP_2020_match$duplicates_filtered,"label")<-"No. of Treatments in the Database"
attr(CONS_C1_df_dup_SEP_2020_match$dg_trs_cons_sus_or,"label")<-"Drug Dependence"
attr(CONS_C1_df_dup_SEP_2020_match$evaluacindelprocesoteraputico,"label")<-"Evaluation of the Therapeutic Process"

knitr::opts_chunk$set(echo = FALSE, warning=FALSE, message=FALSE)

table1_all <- suppressWarnings(compareGroups(tipo_de_plan_res ~ sus_ini_mod_mvv+ estado_conyugal_2+ escolaridad_rec+ edad_ini_cons+ freq_cons_sus_prin+ origen_ingreso_mod+ dg_cie_10_rec+ nombre_region+ tipo_centro_pub+ sexo_2+ dg_trs_cons_sus_or+ edad_al_ing+ fech_ing_num+ abandono_temprano_rec+ duplicates_filtered+ dg_trs_cons_sus_or+ evaluacindelprocesoteraputico, method= c(
                                            sus_ini_mod_mvv=3,
                                            estado_conyugal_2=3,
                                            escolaridad_rec=3,
                                            edad_ini_cons=3,
                                            freq_cons_sus_prin=3,
                                            origen_ingreso_mod=3,
                                            dg_cie_10_rec=3,
                                            dg_trs_cons_sus_or=3,
                                            nombre_region=3,
                                            tipo_centro_pub=3,
                                            sexo_2=3,
                                            dg_trs_cons_sus_or=3,
                                            edad_al_ing=2,
                                            fech_ing_num=2,
                                            abandono_temprano_rec=3,
                                            duplicates_filtered=3,
                                            evaluacindelprocesoteraputico=3),
                       data = CONS_C1_df_dup_SEP_2020_match,
                       include.miss = T,
                       var.equal=T)
)
table1_more_one <- suppressWarnings(compareGroups(tipo_de_plan_res ~ sus_ini_mod_mvv+ estado_conyugal_2+ escolaridad_rec+ edad_ini_cons+ freq_cons_sus_prin+ origen_ingreso_mod+ dg_cie_10_rec+ dg_trs_cons_sus_or+ nombre_region+ tipo_centro_pub+ sexo_2+ dg_trs_cons_sus_or+ edad_al_ing+ fech_ing_num+ abandono_temprano_rec+ evaluacindelprocesoteraputico, method= c(
                                            sus_ini_mod_mvv=3,
                                            estado_conyugal_2=3,
                                            escolaridad_rec=3,
                                            edad_ini_cons=3,
                                            freq_cons_sus_prin=3,
                                            origen_ingreso_mod=3,
                                            dg_cie_10_rec=3,
                                            dg_trs_cons_sus_or=3,
                                            nombre_region=3,
                                            tipo_centro_pub=3,
                                            sexo_2=3,
                                            dg_trs_cons_sus_or=3,
                                            edad_al_ing=2,
                                            fech_ing_num=2,
                                            abandono_temprano_rec=3,
                                            evaluacindelprocesoteraputico=3),
                       data = CONS_C1_df_dup_SEP_2020_match,
                       include.miss = T,
                       var.equal=T,
                       subset= more_one_treat==1)
)
table1_only_one <- suppressWarnings(compareGroups(tipo_de_plan_res ~ sus_ini_mod_mvv+ estado_conyugal_2+ escolaridad_rec+ edad_ini_cons+ freq_cons_sus_prin+ origen_ingreso_mod+ dg_cie_10_rec+ dg_trs_cons_sus_or+ nombre_region+ tipo_centro_pub+ sexo_2+ dg_trs_cons_sus_or+ edad_al_ing+ fech_ing_num+ abandono_temprano_rec+ evaluacindelprocesoteraputico, method= c(
                                            sus_ini_mod_mvv=3,
                                            estado_conyugal_2=3,
                                            escolaridad_rec=3,
                                            edad_ini_cons=3,
                                            freq_cons_sus_prin=3,
                                            origen_ingreso_mod=3,
                                            dg_cie_10_rec=3,
                                            dg_trs_cons_sus_or=3,
                                            nombre_region=3,
                                            tipo_centro_pub=3,
                                            sexo_2=3,
                                            dg_trs_cons_sus_or=3,
                                            edad_al_ing=2,
                                            fech_ing_num=2,
                                            abandono_temprano_rec=3,
                                            evaluacindelprocesoteraputico=3),
                       data = CONS_C1_df_dup_SEP_2020_match,
                       include.miss = T,
                       var.equal=T,
                       subset= more_one_treat==0)
)
 #Possible values are: 1 - for analysis as "normal-distributed"; 2 - forces analysis as "continuous non-normal"; 3 - forces analysis as "categorical"; and 4 - NA, which performs a Shapiro-Wilks test to decide between normal or non-normal. 

restab1_all <- createTable(table1_all, show.p.overall = T)
restab1_more_one <- createTable(table1_more_one, show.p.overall = T)
restab1_only_one <- createTable(table1_only_one, show.p.overall = T)

pvals1 <- getResults(table1_all)
#p.adjust(pvals, method = "BH")
 export2md(restab1_all, size=11, first.strip=T, hide.no="no", position="center",
           format="html",caption= "Table 1. Summary descriptives at baseline, between Users with Residential and Ambulatory Treatments from 2010-2019",col.names=c("Variables","Residential", "Ambulatory", "p-value"))%>%
  kableExtra::add_footnote(c("Note. Continuous variables are presented as Medians and Percentiles 25 and 75 were shown;", "Categorical variables are presented as number (%)"), notation = "none")%>%
  kableExtra::scroll_box(width = "100%", height = "375px")
Table 1. Summary descriptives at baseline, between Users with Residential and Ambulatory Treatments from 2010-2019
Variables Residential Ambulatory p-value
N=72267 N=12706
Starting Substance: 0.000
Alcohol 41507 (57.4%) 5080 (40.0%)
Cocaine hydrochloride 2682 (3.71%) 477 (3.75%)
Marijuana 18412 (25.5%) 4556 (35.9%)
Other 1669 (2.31%) 318 (2.50%)
Cocaine paste 2767 (3.83%) 1086 (8.55%)
‘Missing’ 5230 (7.24%) 1189 (9.36%)
Marital Status: <0.001
Married/Shared living arrangements 26185 (36.2%) 2910 (22.9%)
Separated/Divorced 7721 (10.7%) 1320 (10.4%)
Single 37343 (51.7%) 8328 (65.5%)
Widower 869 (1.20%) 133 (1.05%)
‘Missing’ 149 (0.21%) 15 (0.12%)
Educational Attainment: <0.001
3-Completed primary school or less 20062 (27.8%) 3862 (30.4%)
2-Completed high school or less 39565 (54.7%) 7044 (55.4%)
1-More than high school 12279 (17.0%) 1777 (14.0%)
‘Missing’ 361 (0.50%) 23 (0.18%)
Frequency of use of primary drug: 0.000
Did not use 1095 (1.52%) 85 (0.67%)
Less than 1 day a week 2862 (3.96%) 133 (1.05%)
2 to 3 days a week 22372 (31.0%) 1329 (10.5%)
4 to 6 days a week 12258 (17.0%) 1654 (13.0%)
1 day a week or more 5335 (7.38%) 272 (2.14%)
Daily 27938 (38.7%) 9219 (72.6%)
‘Missing’ 407 (0.56%) 14 (0.11%)
Motive of Admission to Treatment: 0.000
Spontaneous 33720 (46.7%) 4273 (33.6%)
Assisted Referral 4950 (6.85%) 3013 (23.7%)
Other 3766 (5.21%) 740 (5.82%)
Justice Sector 7159 (9.91%) 812 (6.39%)
Health Sector 22672 (31.4%) 3868 (30.4%)
Psychiatric co-morbidity: <0.001
Without psychiatric comorbidity 29070 (40.2%) 3245 (25.5%)
Diagnosis unknown (under study) 13310 (18.4%) 2771 (21.8%)
With psychiatric comorbidity 29887 (41.4%) 6690 (52.7%)
Type of Center (Public): 0.000
FALSE 14964 (20.7%) 9066 (71.4%)
TRUE 57300 (79.3%) 3623 (28.5%)
‘Missing’ 3 (0.00%) 17 (0.13%)
Sex: <0.001
Men 54806 (75.8%) 8761 (69.0%)
Women 17461 (24.2%) 3945 (31.0%)
Drug Dependence: 0.000
FALSE 22150 (30.7%) 1049 (8.26%)
TRUE 50116 (69.3%) 11657 (91.7%)
‘Missing’ 1 (0.00%) 0 (0.00%)
Age at Admission 34.5 [27.6;43.5] 32.6 [26.3;40.9] <0.001
Date of Admission to Treatment 16577 [15730;17359] 16154 [15342;17023] <0.001
Early Dropout: <0.001
FALSE 61074 (84.5%) 10201 (80.3%)
TRUE 11190 (15.5%) 2499 (19.7%)
‘Missing’ 3 (0.00%) 6 (0.05%)
No. of Treatments in the Database: .
1 58708 (81.2%) 8533 (67.2%)
2 10087 (14.0%) 2804 (22.1%)
3 2471 (3.42%) 927 (7.30%)
4 714 (0.99%) 295 (2.32%)
5 192 (0.27%) 94 (0.74%)
6 67 (0.09%) 36 (0.28%)
7 23 (0.03%) 11 (0.09%)
8 4 (0.01%) 6 (0.05%)
10 1 (0.00%) 0 (0.00%)
Drug Dependence: 0.000
FALSE 22150 (30.7%) 1049 (8.26%)
TRUE 50116 (69.3%) 11657 (91.7%)
‘Missing’ 1 (0.00%) 0 (0.00%)
Evaluation of the Therapeutic Process: <0.001
1-High Achievement 14081 (19.5%) 2831 (22.3%)
2-Medium Achievement 21728 (30.1%) 4237 (33.3%)
3-Minimum Achievement 31549 (43.7%) 5302 (41.7%)
‘Missing’ 4909 (6.79%) 336 (2.64%)
Note. Continuous variables are presented as Medians and Percentiles 25 and 75 were shown;
Categorical variables are presented as number (%)


Of the 85,048 users, we selected 85,048 that fulfilled the conditions stated above (100%).


#Additionally, we generated a correlation plot to get an overview of heterogeneous correlations between the different variables.

#<br>
require(polycor)
#Corresponde a la apreciación clínica que hace el equipo o profesional tratante, la persona en tratamiento y su familia, del nivel alcanzado de logro de los objetivos terapéuticos planteados al inicio del proceso y descritos en el plan de tratamiento personalizado. Los criterios incluyen la evaluación del estado clínico y psicosocial al momento del egreso y una apreciación pronostica del equipo tratante.

#Computes a heterogenous correlation matrix, consisting of Pearson product-moment correlations between numeric variables, polyserial correlations between numeric and ordinal variables, and polychoric correlations between 
tiempo_antes_hetcor<-Sys.time()
hetcor_mat<-hetcor(CONS_C1_df_dup_SEP_2020_match[,-c("hash_key","row","more_one_treat","duplicates_filtered")], ML = T, std.err =T, use="pairwise.complete.obs", bins=3, pd=TRUE)
tiempo_despues_hetcor<-Sys.time()
tiempo_hetcor<-tiempo_despues_hetcor-tiempo_antes_hetcor

attr(hetcor_mat$correlations,"dimnames")[[2]][1]<-"Starting Substance"
attr(hetcor_mat$correlations,"dimnames")[[2]][2]<-"Marital Status"
attr(hetcor_mat$correlations,"dimnames")[[2]][3]<-"Educational Attainment"
attr(hetcor_mat$correlations,"dimnames")[[2]][4]<-"Age of Onset of Drug Use"
attr(hetcor_mat$correlations,"dimnames")[[2]][5]<-"Frequency of use of primary drug"
attr(hetcor_mat$correlations,"dimnames")[[2]][6]<-"Motive of Admission to Treatment"
attr(hetcor_mat$correlations,"dimnames")[[2]][7]<-"Psychiatric comorbidity"
#attr(hetcor_mat$correlations,"dimnames")[[2]][8]<-"Physical comorbidity"
attr(hetcor_mat$correlations,"dimnames")[[2]][8]<-"Chilean Region of the Center"
attr(hetcor_mat$correlations,"dimnames")[[2]][9]<-"Type of Center (Public)"
attr(hetcor_mat$correlations,"dimnames")[[2]][10]<-"Sex"
attr(hetcor_mat$correlations,"dimnames")[[2]][11]<-"Age at Admission"
attr(hetcor_mat$correlations,"dimnames")[[2]][12]<-"Date of Admission"
attr(hetcor_mat$correlations,"dimnames")[[2]][13]<-"Early Drop out"
attr(hetcor_mat$correlations,"dimnames")[[2]][14]<-"Residential Treatment"
attr(hetcor_mat$correlations,"dimnames")[[2]][15]<-"Drug Dependence"
attr(hetcor_mat$correlations,"dimnames")[[2]][16]<-"Evaluation of the Therapeutic Process"

attr(hetcor_mat$correlations,"dimnames")[[1]][1]<-"Starting Substance"
attr(hetcor_mat$correlations,"dimnames")[[1]][2]<-"Marital Status"
attr(hetcor_mat$correlations,"dimnames")[[1]][3]<-"Educational Attainment"
attr(hetcor_mat$correlations,"dimnames")[[1]][4]<-"Age of Onset of Drug Use"
attr(hetcor_mat$correlations,"dimnames")[[1]][5]<-"Frequency of use of primary drug"
attr(hetcor_mat$correlations,"dimnames")[[1]][6]<-"Motive of Admission to Treatment"
attr(hetcor_mat$correlations,"dimnames")[[1]][7]<-"Psychiatric comorbidity"
#attr(hetcor_mat$correlations,"dimnames")[[1]][8]<-"Physical comorbidity"
attr(hetcor_mat$correlations,"dimnames")[[1]][8]<-"Chilean Region of the Center"
attr(hetcor_mat$correlations,"dimnames")[[1]][9]<-"Type of Center (Public)"
attr(hetcor_mat$correlations,"dimnames")[[1]][10]<-"Sex"
attr(hetcor_mat$correlations,"dimnames")[[1]][11]<-"Age at Admission"
attr(hetcor_mat$correlations,"dimnames")[[1]][12]<-"Date of Admission"
attr(hetcor_mat$correlations,"dimnames")[[1]][13]<-"Early Drop out"
attr(hetcor_mat$correlations,"dimnames")[[1]][14]<-"Residential Treatment"
attr(hetcor_mat$correlations,"dimnames")[[1]][15]<-"Drug Dependence"
attr(hetcor_mat$correlations,"dimnames")[[1]][16]<-"Evaluation of the Therapeutic Process"

attr(hetcor_mat$tests,"dimnames")[[2]][1]<-"Starting Substance"
attr(hetcor_mat$tests,"dimnames")[[2]][2]<-"Marital Status"
attr(hetcor_mat$tests,"dimnames")[[2]][3]<-"Educational Attainment"
attr(hetcor_mat$tests,"dimnames")[[2]][4]<-"Age of Onset of Drug Use"
attr(hetcor_mat$tests,"dimnames")[[2]][5]<-"Frequency of use of primary drug"
attr(hetcor_mat$tests,"dimnames")[[2]][6]<-"Motive of Admission to Treatment"
attr(hetcor_mat$tests,"dimnames")[[2]][7]<-"Psychiatric comorbidity"
#attr(hetcor_mat$tests,"dimnames")[[2]][8]<-"Physical comorbidity"
attr(hetcor_mat$tests,"dimnames")[[2]][8]<-"Chilean Region of the Center"
attr(hetcor_mat$tests,"dimnames")[[2]][9]<-"Type of Center (Public)"
attr(hetcor_mat$tests,"dimnames")[[2]][10]<-"Sex"
attr(hetcor_mat$tests,"dimnames")[[2]][11]<-"Age at Admission"
attr(hetcor_mat$tests,"dimnames")[[2]][12]<-"Date of Admission"
attr(hetcor_mat$tests,"dimnames")[[2]][13]<-"Early Drop out"
attr(hetcor_mat$tests,"dimnames")[[2]][14]<-"Residential Treatment"
attr(hetcor_mat$tests,"dimnames")[[2]][15]<-"Drug Dependence"
attr(hetcor_mat$tests,"dimnames")[[2]][16]<-"Evaluation of the Therapeutic Process"

attr(hetcor_mat$tests,"dimnames")[[1]][1]<-"Starting Substance"
attr(hetcor_mat$tests,"dimnames")[[1]][2]<-"Marital Status"
attr(hetcor_mat$tests,"dimnames")[[1]][3]<-"Educational Attainment"
attr(hetcor_mat$tests,"dimnames")[[1]][4]<-"Age of Onset of Drug Use"
attr(hetcor_mat$tests,"dimnames")[[1]][5]<-"Frequency of use of primary drug"
attr(hetcor_mat$tests,"dimnames")[[1]][6]<-"Motive of Admission to Treatment"
attr(hetcor_mat$tests,"dimnames")[[1]][7]<-"Psychiatric comorbidity"
#attr(hetcor_mat$tests,"dimnames")[[1]][8]<-"Physical comorbidity"
attr(hetcor_mat$tests,"dimnames")[[1]][8]<-"Chilean Region of the Center"
attr(hetcor_mat$tests,"dimnames")[[1]][9]<-"Type of Center (Public)"
attr(hetcor_mat$tests,"dimnames")[[1]][10]<-"Sex"
attr(hetcor_mat$tests,"dimnames")[[1]][11]<-"Age at Admission"
attr(hetcor_mat$tests,"dimnames")[[1]][12]<-"Date of Admission"
attr(hetcor_mat$tests,"dimnames")[[1]][13]<-"Early Drop out"
attr(hetcor_mat$tests,"dimnames")[[1]][14]<-"Residential Treatment"
attr(hetcor_mat$tests,"dimnames")[[1]][15]<-"Drug Dependence"
attr(hetcor_mat$tests,"dimnames")[[1]][16]<-"Evaluation of the Therapeutic Process"

hetcor_mat$tests[is.na(hetcor_mat$tests)]<-1

ggcorrplot<-
ggcorrplot::ggcorrplot(hetcor_mat$correlations,
           ggtheme = ggplot2::theme_void,
           insig = "blank",
           pch=1,
           pch.cex=3,
           tl.srt = 45, 
           #pch="ns",
            p.mat = hetcor_mat$tests, #  replacement has 144 rows, data has 169
            #type = "lower",
           colors = c("#6D9EC1", "white", "#E46726"), 
           tl.cex=8,
           lab=F)+
  #scale_x_discrete(labels = var_lbls_p345, drop = F) +
  #scale_y_discrete(labels = var_lbls_p345, drop = F) +
  theme(axis.text.x = element_blank())+
  #theme(axis.text.y = element_text(size=7.5,color ="black", hjust = 1))+
  theme(axis.text.y = element_blank())+
  theme(legend.position="bottom")

ggplotly(ggcorrplot, height = 800, width=800)%>% 
  layout(xaxis= list(showticklabels = FALSE)) %>% 
 layout(annotations = 
 list(x = .1, y = -0.031, text = "", 
      showarrow = F, xref='paper', yref='paper', 
      #xanchor='center', yanchor='auto', xshift=0, yshift=-0,
      font=list(size=11, color="darkblue"))
 )


Imputation


We generated a plot to see all the missing values in the sample.


Figure 3. Bar plot of Porcentaje of Missing Values per Variables at Basline






From the figure above, we could see that the starting substance (sus_ini_mvv), the onset of drug use (edad_ini_cons) and the evaluation of the therapeutic process (evaluacindelprocesoteraputico) had around 6% of missing data. These values should be imputed. We first focused on the age of onset of drug use. It is important to consider that the evaluation of the therapeutic process could be distorted due to censoring (many users did not finish their treatment, and did not have this evaluation in the study period).



Age at Admission

We started looking over the missing values in the age at admission (n8). Since there were not cases with more than one treatment that had not an age of admission, we did not have to impute taking into account serial dependencies in the dates of treatment.

Figure 5. Density Estimation of Distributions of Age at Admission & Imputed Age at Admission

Figure 5. Density Estimation of Distributions of Age at Admission & Imputed Age at Admission


As seen in the Figure above, distributions seem to differ. However, considering the low amount of missing values in this variable, we proceeded with the imputation with the mean, despite the differences found. The imputed values must not be greater than the age of onset of drug use and may not be lower than 16 years old. Values lower than this age may be considered less likely to receive treatment for adult population, so it would be most probably incorrect that they would be in this database.


## [1] "Users that had more than one treatment with no date of admission:0"


Age of Onset of Drug Use

Another variable worth imputing is the Age of Onset of Drug Use (n= 6,549).


Figure 6. Density Estimation of Distributions of Age Of Onset of Drug Use & Imputed Ones

Figure 6. Density Estimation of Distributions of Age Of Onset of Drug Use & Imputed Ones


Based on the figure above, the age of onset of drug use was similar between the imputed values and the observed. However, we followed the rules stated in Duplicates process (link). There were three logical conditions to fulfill in order to replace adequately these values in the database: the age of onset must not be greater than the age of onset of drug use in the primary substance at admission (1), may not be greater than the age of admission to treatment (2), and the age of onset of drug use must be greater than 4 years old. Then, we selected the minimum value of age of onset of drug use among the imputed, because one user could not have more than one age of onset of drug use.


## [1] "Number of users that had more than one different age of onset of drug use before replacement: 0"

Figure 7. Bar plot of Percentage of Incorrect Imputed Values per Imputation Sample

## [1] "Cases with more than missing one age of onset: 515"
## [1] "Number of rows with values that did not fulfilled the conditions: 0"
## [1] "Number of rows with values that did not fulfilled the conditions after replacement with the minimum by users: 0"
## [1] "Number of users that had different age of onset of drug use after replacement: 0"



There were 0 cases of imputed ages of onset of drug use that did not fulfilled the conditions necessary to replace the missing values with the imputed ones.


Starting Substance

Then we selected the most vulnerable value among the candidates of imputations of the starting substance (First, Cocaine paste, Cocaine hydrochloride or snort cocaine, Marijuana, Alcohol, and Other).


# Ver distintos valores propuestos para sustancia de inciio
sus_ini_mod_mvv_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$sus_ini_mod_mvv,
       amelia_fit$imputations$imp2$sus_ini_mod_mvv,
       amelia_fit$imputations$imp3$sus_ini_mod_mvv,
       amelia_fit$imputations$imp4$sus_ini_mod_mvv,
       amelia_fit$imputations$imp5$sus_ini_mod_mvv,
       amelia_fit$imputations$imp6$sus_ini_mod_mvv,
       amelia_fit$imputations$imp7$sus_ini_mod_mvv,
       amelia_fit$imputations$imp8$sus_ini_mod_mvv,
       amelia_fit$imputations$imp9$sus_ini_mod_mvv,
       amelia_fit$imputations$imp10$sus_ini_mod_mvv,
       amelia_fit$imputations$imp11$sus_ini_mod_mvv,
       amelia_fit$imputations$imp12$sus_ini_mod_mvv,
       amelia_fit$imputations$imp13$sus_ini_mod_mvv,
       amelia_fit$imputations$imp14$sus_ini_mod_mvv,
       amelia_fit$imputations$imp15$sus_ini_mod_mvv,
       amelia_fit$imputations$imp16$sus_ini_mod_mvv,
       amelia_fit$imputations$imp17$sus_ini_mod_mvv,
       amelia_fit$imputations$imp18$sus_ini_mod_mvv,
       amelia_fit$imputations$imp19$sus_ini_mod_mvv,
       amelia_fit$imputations$imp20$sus_ini_mod_mvv,
       amelia_fit$imputations$imp21$sus_ini_mod_mvv,
       amelia_fit$imputations$imp22$sus_ini_mod_mvv,
       amelia_fit$imputations$imp23$sus_ini_mod_mvv,
       amelia_fit$imputations$imp24$sus_ini_mod_mvv,
       amelia_fit$imputations$imp25$sus_ini_mod_mvv,
       amelia_fit$imputations$imp26$sus_ini_mod_mvv,
       amelia_fit$imputations$imp27$sus_ini_mod_mvv,
       amelia_fit$imputations$imp28$sus_ini_mod_mvv,
       amelia_fit$imputations$imp29$sus_ini_mod_mvv,
       amelia_fit$imputations$imp30$sus_ini_mod_mvv
       ) 

sus_ini_mod_mvv_imputed<-
sus_ini_mod_mvv_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Marijuana",as.character(.))~1,TRUE~0), .names="mar_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Alcohol",as.character(.))~1,TRUE~0), .names="oh_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Cocaine paste",as.character(.))~1,TRUE~0), .names="pb_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Cocaine hydrochloride",as.character(.))~1,TRUE~0), .names="coc_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Other",as.character(.))~1,TRUE~0), .names="otr_{col}"))%>%
        dplyr::mutate(sus_ini_mod_mvv_mar = base::rowSums(dplyr::select(., starts_with("mar_"))))%>%
  dplyr::mutate(sus_ini_mod_mvv_oh = base::rowSums(dplyr::select(., starts_with("oh_"))))%>%
  dplyr::mutate(sus_ini_mod_mvv_pb = base::rowSums(dplyr::select(., starts_with("pb_"))))%>%
  dplyr::mutate(sus_ini_mod_mvv_coc = base::rowSums(dplyr::select(., starts_with("coc_"))))%>%
  dplyr::mutate(sus_ini_mod_mvv_otr = base::rowSums(dplyr::select(., starts_with("otr_")))) %>% 
  #dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_mar>0~1,TRUE~0)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_oh>0~sus_ini_mod_mvv_tot+1,TRUE~sus_ini_mod_mvv_tot)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_pb>0~sus_ini_mod_mvv_tot+1,TRUE~sus_ini_mod_mvv_tot)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_coc>0~sus_ini_mod_mvv_tot+1,TRUE~sus_ini_mod_mvv_tot)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_otr>0~sus_ini_mod_mvv_tot+1,TRUE~sus_ini_mod_mvv_tot)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_to_imputation=dplyr::case_when(sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_pb>0~"Cocaine paste",sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_coc>0~"Cocaine hydrochloride",sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_mar>0~"Marijuana",sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_oh>0~"Alcohol",sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_otr>0~"Other",sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_pb>0~"Cocaine paste",sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_coc>0~"Cocaine hydrochloride",sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_mar>0~"Marijuana",sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_oh>0~"Alcohol",sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_otr>0~"Other")) %>% 
  janitor::clean_names()

sus_ini_mod_mvv_imputed<-
dplyr::select(sus_ini_mod_mvv_imputed,amelia_fit_imputations_imp1_row,sus_ini_mod_mvv_to_imputation)

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
CONS_C1_df_dup_SEP_2020_match_miss2<-
CONS_C1_df_dup_SEP_2020_match_miss1 %>% 
   dplyr::left_join(sus_ini_mod_mvv_imputed, by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(sus_ini_mod_mvv=factor(dplyr::case_when(is.na(sus_ini_mod_mvv)~as.character(sus_ini_mod_mvv_to_imputation),
                                 TRUE~as.character(sus_ini_mod_mvv)))) %>% 
  dplyr::select(-sus_ini_mod_mvv_to_imputation) %>% 
  data.table()
#_#_#_#_#_#_#__#_##_#_#_#_#_#_#_#_#_#_#_#_#__#_##_#_#_#_#_##_#_#_#_#_#_#__#_##_#_#_#_#_#_#_#_#_#_#_#_#__#_##_#_#_#_#_#
#_#_#_#_#_#_#__#_##_#_#_#_#_#_#_#_#_#_#_#_#__#_##_#_#_#_#_##_#_#_#_#_#_#__#_##_#_#_#_#_#_#_#_#_#_#_#_#__#_##_#_#_#_#_#


Frequency of Use of the Primary Drug at Admission

Another variable that is worth imputing is the Frequency of use of primary drug at admission (n= 568). In case of ties, we selected the imputed values with the value with the most frequent drug use.


# Ver distintos valores propuestos para sustancia de inciio
freq_cons_sus_prin_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$freq_cons_sus_prin,
       amelia_fit$imputations$imp2$freq_cons_sus_prin,
       amelia_fit$imputations$imp3$freq_cons_sus_prin,
       amelia_fit$imputations$imp4$freq_cons_sus_prin,
       amelia_fit$imputations$imp5$freq_cons_sus_prin,
       amelia_fit$imputations$imp6$freq_cons_sus_prin,
       amelia_fit$imputations$imp7$freq_cons_sus_prin,
       amelia_fit$imputations$imp8$freq_cons_sus_prin,
       amelia_fit$imputations$imp9$freq_cons_sus_prin,
       amelia_fit$imputations$imp10$freq_cons_sus_prin,
       amelia_fit$imputations$imp11$freq_cons_sus_prin,
       amelia_fit$imputations$imp12$freq_cons_sus_prin,
       amelia_fit$imputations$imp13$freq_cons_sus_prin,
       amelia_fit$imputations$imp14$freq_cons_sus_prin,
       amelia_fit$imputations$imp15$freq_cons_sus_prin,
       amelia_fit$imputations$imp16$freq_cons_sus_prin,
       amelia_fit$imputations$imp17$freq_cons_sus_prin,
       amelia_fit$imputations$imp18$freq_cons_sus_prin,
       amelia_fit$imputations$imp19$freq_cons_sus_prin,
       amelia_fit$imputations$imp20$freq_cons_sus_prin,
       amelia_fit$imputations$imp21$freq_cons_sus_prin,
       amelia_fit$imputations$imp22$freq_cons_sus_prin,
       amelia_fit$imputations$imp23$freq_cons_sus_prin,
       amelia_fit$imputations$imp24$freq_cons_sus_prin,
       amelia_fit$imputations$imp25$freq_cons_sus_prin,
       amelia_fit$imputations$imp26$freq_cons_sus_prin,
       amelia_fit$imputations$imp27$freq_cons_sus_prin,
       amelia_fit$imputations$imp28$freq_cons_sus_prin,
       amelia_fit$imputations$imp29$freq_cons_sus_prin,
       amelia_fit$imputations$imp30$freq_cons_sus_prin
       ) 

freq_cons_sus_prin_imputed<-
freq_cons_sus_prin_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("1 day a week or more",as.character(.))~1,TRUE~0), .names="1_day_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("2 to 3 days a week",as.character(.))~1,TRUE~0), .names="2_3_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("4 to 6 days a week",as.character(.))~1,TRUE~0), .names="4_6_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("Less than 1 day a week",as.character(.))~1,TRUE~0), .names="less_1_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("Did not use",as.character(.))~1,TRUE~0), .names="did_not_{col}"))%>%
    dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("Daily",as.character(.))~1,TRUE~0), .names="daily_{col}"))%>%
  dplyr::mutate(freq_cons_sus_prin_daily = base::rowSums(dplyr::select(., starts_with("daily_")))) %>% 
  dplyr::mutate(freq_cons_sus_prin_4_6 = base::rowSums(dplyr::select(., starts_with("4_6_"))))%>%
  dplyr::mutate(freq_cons_sus_prin_2_3 = base::rowSums(dplyr::select(., starts_with("2_3_"))))%>%
  dplyr::mutate(freq_cons_sus_prin_1_day = base::rowSums(dplyr::select(., starts_with("1_day_"))))%>%
  dplyr::mutate(freq_cons_sus_prin_less_1 = base::rowSums(dplyr::select(., starts_with("less_1_"))))%>%
  dplyr::mutate(freq_cons_sus_prin_did_not = base::rowSums(dplyr::select(., starts_with("did_not_")))) %>% 
  #dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_1_day>0~1,TRUE~0)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_2_3>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_4_6>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_less_1>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_did_not>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_daily>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  #hierarchy
  dplyr::mutate(freq_cons_sus_prin_to_imputation=
                  dplyr::case_when(freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_daily>0~"Daily",
                                     freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_4_6>0~"4 to 6 days a week",freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_2_3>0~"2 to 3 days a week",freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_1_day>0~"1 day a week or more",freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_less_1>0~"Less than 1 day a week",freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_did_not>0~"Did not use",freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_daily>0~"Daily",freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_4_6>0~"4 to 6 days a week",freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_2_3>0~"2 to 3 days a week",freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_1_day>0~"1 day a week or more",freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_less_1>0~"Less than 1 day a week",freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_did_not>0~"Did not use")) %>% 
  janitor::clean_names()

freq_cons_sus_prin_imputed<-
dplyr::select(freq_cons_sus_prin_imputed,amelia_fit_imputations_imp1_row,freq_cons_sus_prin_to_imputation)

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss3<-
CONS_C1_df_dup_SEP_2020_match_miss2 %>% 
   dplyr::left_join(freq_cons_sus_prin_imputed, by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(freq_cons_sus_prin=factor(dplyr::case_when(is.na(freq_cons_sus_prin)~as.character(freq_cons_sus_prin_to_imputation), TRUE~as.character(freq_cons_sus_prin)))) %>% 
  data.table()


Educational Attainment

Another variable that is worth imputing is the Educational Attainment (n= 437). we followed the rules stated in Duplicates4 process (link). We were particularly cautious to impute attainments that would follow a progression from primary school to more than high school. For this purpose, we first looked over the actual values per user, filling intermediate gaps in educational attainment in users with intermediate null values (a), we overcame with the difficulty of the incorrect imputations, by logically selecting if there were any .


# Ver distintos valores propuestos para sustancia de inciio
escolaridad_rec_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
                  amelia_fit$imputations$imp1$hash_key,
                  amelia_fit$imputations$imp1$fech_ing_num,
                  amelia_fit$imputations$imp1$escolaridad_rec,
                  amelia_fit$imputations$imp2$escolaridad_rec,
                  amelia_fit$imputations$imp3$escolaridad_rec,
                  amelia_fit$imputations$imp4$escolaridad_rec,
                  amelia_fit$imputations$imp5$escolaridad_rec,
                  amelia_fit$imputations$imp6$escolaridad_rec,
                  amelia_fit$imputations$imp7$escolaridad_rec,
                  amelia_fit$imputations$imp8$escolaridad_rec,
                  amelia_fit$imputations$imp9$escolaridad_rec,
                  amelia_fit$imputations$imp10$escolaridad_rec,
                  amelia_fit$imputations$imp11$escolaridad_rec,
                  amelia_fit$imputations$imp12$escolaridad_rec,
                  amelia_fit$imputations$imp13$escolaridad_rec,
                  amelia_fit$imputations$imp14$escolaridad_rec,
                  amelia_fit$imputations$imp15$escolaridad_rec,
                  amelia_fit$imputations$imp16$escolaridad_rec,
                  amelia_fit$imputations$imp17$escolaridad_rec,
                  amelia_fit$imputations$imp18$escolaridad_rec,
                  amelia_fit$imputations$imp19$escolaridad_rec,
                  amelia_fit$imputations$imp20$escolaridad_rec,
                  amelia_fit$imputations$imp21$escolaridad_rec,
                  amelia_fit$imputations$imp22$escolaridad_rec,
                  amelia_fit$imputations$imp23$escolaridad_rec,
                  amelia_fit$imputations$imp24$escolaridad_rec,
                  amelia_fit$imputations$imp25$escolaridad_rec,
                  amelia_fit$imputations$imp26$escolaridad_rec,
                  amelia_fit$imputations$imp27$escolaridad_rec,
                  amelia_fit$imputations$imp28$escolaridad_rec,
                  amelia_fit$imputations$imp29$escolaridad_rec,
                  amelia_fit$imputations$imp30$escolaridad_rec) 

escolaridad_rec_imputed2<-
escolaridad_rec_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.escolaridad_rec:amelia_fit.imputations.imp30.escolaridad_rec),~dplyr::case_when(grepl("3-Completed primary school or less",as.character(.))~1,TRUE~0), .names="3_primary_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.escolaridad_rec:amelia_fit.imputations.imp30.escolaridad_rec),~dplyr::case_when(grepl("2-Completed high school or less",as.character(.))~1,TRUE~0), .names="2_high_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.escolaridad_rec:amelia_fit.imputations.imp30.escolaridad_rec),~dplyr::case_when(grepl("1-More than high school",as.character(.))~1,TRUE~0), .names="1_more_high_{col}")) %>% 

  dplyr::mutate(escolaridad_rec_3_primary = base::rowSums(dplyr::select(., contains("3_primary_")))) %>% 
  dplyr::mutate(escolaridad_rec_2_high = base::rowSums(dplyr::select(., contains("2_high_"))))%>%
  dplyr::mutate(escolaridad_rec_1_more_high = base::rowSums(dplyr::select(., contains("1_more_high_"))))

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#create an ordered index of the number of treatments by user
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

#:#:#:#;#;#;
CONS_C1_df_dup_SEP_2020_match_rn<-
    CONS_C1_df_dup_SEP_2020_match_miss %>%  #base de datos original, sin imputaciones
    dplyr::group_by(hash_key) %>% 
    dplyr::mutate(rn=row_number()) %>% 
    dplyr::ungroup() %>% 
    dplyr::select(rn)
#:#:#:#;#;#;
escolaridad_rec_imputed3<-
escolaridad_rec_imputed2 %>%   
  dplyr::left_join(cbind.data.frame(CONS_C1_df_dup_SEP_2020_match_miss$row, CONS_C1_df_dup_SEP_2020_match_miss$escolaridad_rec,CONS_C1_df_dup_SEP_2020_match_rn$rn),by=c("amelia_fit.imputations.imp1.row"="CONS_C1_df_dup_SEP_2020_match_miss$row")) %>%
  dplyr::rename("escolaridad_rec_original"="CONS_C1_df_dup_SEP_2020_match_miss$escolaridad_rec") %>%
  dplyr::mutate(escolaridad_rec_original=as.numeric(substr(escolaridad_rec_original, 1, 1))) %>%
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #ordenar por tratamientos por usuario
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  dplyr::arrange(amelia_fit.imputations.imp1.hash_key,`CONS_C1_df_dup_SEP_2020_match_rn$rn`) %>% 
  dplyr::group_by(amelia_fit.imputations.imp1.hash_key) %>%  
  dplyr::mutate(siguiente_escolaridad_rec_original=lead(escolaridad_rec_original), 
                subsig_escolaridad_rec_original=lead(escolaridad_rec_original,n =2), 
                rn=max(`CONS_C1_df_dup_SEP_2020_match_rn$rn`),
                n_na_esc_or=is.na(escolaridad_rec_original),
                sum_n_na_esc_or=sum(n_na_esc_or,na.rm=T),
                max_sum_n_na_esc_or=max(n_na_esc_or,na.rm=T)
                ) %>% 
#dplyr::select(amelia_fit.imputations.imp1.hash_key,amelia_fit.imputations.imp30.rn,
#              siguiente_escolaridad_rec_original,escolaridad_rec_original,amelia_fit.imputations.imp1.fech_ing_num)%>% View()
  dplyr::ungroup()

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PREPARACIÓN  BASE DE DATOS PARA IMPUTACION Y CREACIÓN DE VARIABLES PARA CONDICIONES
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
escolaridad_rec_imputed4 <-
escolaridad_rec_imputed3 %>% 
  dplyr::select(amelia_fit.imputations.imp1.hash_key,`CONS_C1_df_dup_SEP_2020_match_rn$rn`,escolaridad_rec_original,escolaridad_rec_3_primary,escolaridad_rec_2_high, escolaridad_rec_1_more_high) %>%
  dplyr::rename("hash_key"="amelia_fit.imputations.imp1.hash_key") %>% 
  dplyr::rename("treat_no_for_usr"="CONS_C1_df_dup_SEP_2020_match_rn$rn") %>% 
  dplyr::group_by(hash_key) %>% 
  dplyr::mutate(treat_per_usr=max(treat_no_for_usr,na.rm=T)) %>% 
  dplyr::ungroup() %>% 
  tidyr::pivot_wider(names_from=treat_no_for_usr,
                     #names_glue = "ord_treat_esc_{.value}",
                     values_from=c(escolaridad_rec_original,escolaridad_rec_3_primary,escolaridad_rec_2_high,escolaridad_rec_1_more_high),values_fill = NA) %>% 
#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:
#Ver si existen inconsistencias en la escolaridad, pero no sólo inconsistencias inmediatas, sino con hasta 2 espacios entre tratamientos
#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:
#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:
  dplyr::mutate(escolaridad_rec_tot_cond=dplyr::case_when(
    (escolaridad_rec_original_10>escolaridad_rec_original_9)|(escolaridad_rec_original_10>escolaridad_rec_original_8)|(escolaridad_rec_original_10>escolaridad_rec_original_7)|
      (escolaridad_rec_original_9>escolaridad_rec_original_8)|(escolaridad_rec_original_9>escolaridad_rec_original_7)|(escolaridad_rec_original_9>escolaridad_rec_original_6)|
      (escolaridad_rec_original_8>escolaridad_rec_original_7)|(escolaridad_rec_original_8>escolaridad_rec_original_6)|(escolaridad_rec_original_8>escolaridad_rec_original_5)|
      (escolaridad_rec_original_7>escolaridad_rec_original_6)|(escolaridad_rec_original_7>escolaridad_rec_original_5)|(escolaridad_rec_original_7>escolaridad_rec_original_4)|
      (escolaridad_rec_original_6>escolaridad_rec_original_5)|(escolaridad_rec_original_6>escolaridad_rec_original_4)|(escolaridad_rec_original_6>escolaridad_rec_original_3)|
      (escolaridad_rec_original_5>escolaridad_rec_original_4)|(escolaridad_rec_original_5>escolaridad_rec_original_3)|(escolaridad_rec_original_5>escolaridad_rec_original_2)|
      (escolaridad_rec_original_4>escolaridad_rec_original_3)|(escolaridad_rec_original_4>escolaridad_rec_original_2)|(escolaridad_rec_original_4>escolaridad_rec_original_1)|
      (escolaridad_rec_original_3>escolaridad_rec_original_2)|(escolaridad_rec_original_3>escolaridad_rec_original_1)|
      (escolaridad_rec_original_2>escolaridad_rec_original_1)~1,TRUE~0)) %>% 
  #dplyr::filter(escolaridad_rec_tot_cond==1) %>% #View() #0 rows ¿y 374745c85601976177fe614a7370e475?
  #dplyr::filter(treat_per_usr>1) %>% 
  #:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:
  # Ver si hay valores de escolaridad ausentes en una progresión de tratamientos
  #:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:
  dplyr::mutate(sum_nas_esc=base::rowSums(is.na(dplyr::select(., starts_with("escolaridad_rec_original_")))))%>%
  
  dplyr::mutate(escolaridad_rec_tot_nas_en_medio=dplyr::case_when(
      (sum_nas_esc>10 & treat_per_usr==10)|
      (sum_nas_esc>1 & treat_per_usr==9)|
      (sum_nas_esc>2 & treat_per_usr==8)|
      (sum_nas_esc>3 & treat_per_usr==7)|
      (sum_nas_esc>4 & treat_per_usr==6)|
      (sum_nas_esc>5 & treat_per_usr==5)|
      (sum_nas_esc>6 & treat_per_usr==4)|
      (sum_nas_esc>7 & treat_per_usr==3)|
      (sum_nas_esc>8 & treat_per_usr==2)|
      (sum_nas_esc>9 & treat_per_usr==1)~1,TRUE~0)) %>% #18b1f9646a2cd6bebd962637cff0a21a 5 casos
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  #Generar la escolaridad al final
  #:#:#:#:#:#:#:#:#
  dplyr::mutate(last_esc=dplyr::case_when(treat_per_usr==10~escolaridad_rec_original_10,
                                          treat_per_usr==9~escolaridad_rec_original_9,
                                          treat_per_usr==8~escolaridad_rec_original_8,
                                          treat_per_usr==7~escolaridad_rec_original_7,
                                          treat_per_usr==6~escolaridad_rec_original_6,
                                          treat_per_usr==5~escolaridad_rec_original_5,
                                          treat_per_usr==4~escolaridad_rec_original_4,
                                          treat_per_usr==3~escolaridad_rec_original_3,
                                          treat_per_usr==2~escolaridad_rec_original_2,
                                          treat_per_usr==1~escolaridad_rec_original_1)) %>% 
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#a0))si valor final vs. inicial son iguales, imputar todo lo que está en medio con el mismo valor
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  dplyr::mutate(escolaridad_rec_original_9=
          dplyr::case_when(escolaridad_rec_original_1==last_esc & treat_per_usr>9 & !is.na(escolaridad_rec_original_1)~escolaridad_rec_original_1,
                           TRUE~escolaridad_rec_original_9)) %>% 
  dplyr::mutate(escolaridad_rec_original_8=
          dplyr::case_when(escolaridad_rec_original_1==last_esc & treat_per_usr>8 & !is.na(escolaridad_rec_original_1)~escolaridad_rec_original_1,
                           TRUE~escolaridad_rec_original_8)) %>% 
  dplyr::mutate(escolaridad_rec_original_7=
          dplyr::case_when(escolaridad_rec_original_1==last_esc & treat_per_usr>7 & !is.na(escolaridad_rec_original_1)~escolaridad_rec_original_1,
                           TRUE~escolaridad_rec_original_7)) %>% 
  dplyr::mutate(escolaridad_rec_original_6=
          dplyr::case_when(escolaridad_rec_original_1==last_esc & treat_per_usr>6 & !is.na(escolaridad_rec_original_1)~escolaridad_rec_original_1,
                           TRUE~escolaridad_rec_original_6)) %>% 
  dplyr::mutate(escolaridad_rec_original_5=
          dplyr::case_when(escolaridad_rec_original_1==last_esc & treat_per_usr>5 & !is.na(escolaridad_rec_original_1)~escolaridad_rec_original_1,
                           TRUE~escolaridad_rec_original_5)) %>% 
  dplyr::mutate(escolaridad_rec_original_4=
          dplyr::case_when(escolaridad_rec_original_1==last_esc & treat_per_usr>4 & !is.na(escolaridad_rec_original_1)~escolaridad_rec_original_1,
                           TRUE~escolaridad_rec_original_4)) %>% 
  dplyr::mutate(escolaridad_rec_original_3=
          dplyr::case_when(escolaridad_rec_original_1==last_esc & treat_per_usr>3 & !is.na(escolaridad_rec_original_1)~escolaridad_rec_original_1,
                           TRUE~escolaridad_rec_original_3)) %>% 
  dplyr::mutate(escolaridad_rec_original_2=
          dplyr::case_when(escolaridad_rec_original_1==last_esc & treat_per_usr>2 & !is.na(escolaridad_rec_original_1)~escolaridad_rec_original_1,
                           TRUE~escolaridad_rec_original_2)) %>% 
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#a1))cambiar valores vacíos intermedios  /// fijarse en  & escolaridad_rec_tot_cond==1
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#reemplazar el número intermedio por cada tratamiento para cada usuario
  dplyr::mutate(escolaridad_rec_original_9=dplyr::case_when(escolaridad_rec_original_8==escolaridad_rec_original_10 & is.na(escolaridad_rec_original_9)&!is.na(escolaridad_rec_original_10)~escolaridad_rec_original_10,TRUE~escolaridad_rec_original_9)) %>% 
  # dplyr::filter(!is.na(escolaridad_rec_original_10)) %>% View()
  dplyr::mutate(escolaridad_rec_original_8=dplyr::case_when(escolaridad_rec_original_7==escolaridad_rec_original_9 & is.na(escolaridad_rec_original_8)&!is.na(escolaridad_rec_original_9)~escolaridad_rec_original_9,TRUE~escolaridad_rec_original_8)) %>% 
  # dplyr::filter(!is.na(escolaridad_rec_original_9)) %>% View()
  dplyr::mutate(escolaridad_rec_original_7=dplyr::case_when(escolaridad_rec_original_6==escolaridad_rec_original_8 & is.na(escolaridad_rec_original_7)&!is.na(escolaridad_rec_original_8)~escolaridad_rec_original_8 ,TRUE~escolaridad_rec_original_7)) %>% 
  # dplyr::filter(!is.na(escolaridad_rec_original_8)) %>% View()
  dplyr::mutate(escolaridad_rec_original_6=dplyr::case_when(escolaridad_rec_original_5==escolaridad_rec_original_7& is.na(escolaridad_rec_original_6)&!is.na(escolaridad_rec_original_7)~escolaridad_rec_original_7,TRUE~escolaridad_rec_original_6)) %>% 
  # dplyr::filter(!is.na(escolaridad_rec_original_7)) %>% View()
  dplyr::mutate(escolaridad_rec_original_5=dplyr::case_when(escolaridad_rec_original_4==escolaridad_rec_original_6  & is.na(escolaridad_rec_original_5)&!is.na(escolaridad_rec_original_6)~escolaridad_rec_original_6,TRUE~escolaridad_rec_original_5)) %>% 
  # dplyr::filter(!is.na(escolaridad_rec_original_6)) %>% View()
  dplyr::mutate(escolaridad_rec_original_4=dplyr::case_when(escolaridad_rec_original_3==escolaridad_rec_original_5  & is.na(escolaridad_rec_original_4)&!is.na(escolaridad_rec_original_5)~escolaridad_rec_original_5,TRUE~escolaridad_rec_original_4)) %>% 
  # dplyr::filter(!is.na(escolaridad_rec_original_5)) %>% View()
  dplyr::mutate(escolaridad_rec_original_3=dplyr::case_when(escolaridad_rec_original_2==escolaridad_rec_original_4  & is.na(escolaridad_rec_original_3)&!is.na(escolaridad_rec_original_4)~escolaridad_rec_original_4,TRUE~escolaridad_rec_original_3)) %>% 
  # dplyr::filter(!is.na(escolaridad_rec_original_4)) %>% View()
  dplyr::mutate(escolaridad_rec_original_2=dplyr::case_when(escolaridad_rec_original_1==escolaridad_rec_original_3  & is.na(escolaridad_rec_original_2)&!is.na(escolaridad_rec_original_3)~escolaridad_rec_original_3,TRUE~escolaridad_rec_original_2)) %>% 
  # dplyr::filter(!is.na(escolaridad_rec_original_3)) %>% View()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##a2))si tiene información en la segunda pero no en la primera, y no es un valor intermedio como secundaria completa (ya que en ese caso puede adoptar más de un valor: más o igual a ese valor), imputarlo
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  dplyr::mutate(escolaridad_rec_original_1=dplyr::case_when(escolaridad_rec_original_2==3~3,
                                                            escolaridad_rec_original_2==1~1,
                                                            TRUE~escolaridad_rec_original_1)) %>% 
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
##a3))si hay más de 2 tratamientos por usuarios, y tiene información en la segunda pero no en la primera, y es un valor intermedio pero tiene un tercer tratamiento con el mismo valor, imputarlo
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
    dplyr::mutate(escolaridad_rec_original_1=dplyr::case_when(escolaridad_rec_original_2==2 & escolaridad_rec_original_3==2~3,TRUE~escolaridad_rec_original_1))  %>% 

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#medidas para capturar inconsistencias a lo largo de todos los tratamientos de cada usuario
#escolaridad_rec_imputed4 %>% #escolaridad_rec_tot_cond
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  dplyr::mutate(across(c(escolaridad_rec_original_1:escolaridad_rec_original_10),~dplyr::case_when(.==1~1,TRUE~0), .names="1_more_high_{col}")) %>% 
  dplyr::mutate(across(c(escolaridad_rec_original_1:escolaridad_rec_original_10),~dplyr::case_when(.==2~1,TRUE~0), .names="2_high_{col}")) %>% 
  dplyr::mutate(across(c(escolaridad_rec_original_1:escolaridad_rec_original_10),~dplyr::case_when(.==3~1,TRUE~0), .names="3_primary_{col}")) %>% 
  dplyr::mutate(suma_vals_escolaridad_rec_1_more_high = base::rowSums(dplyr::select(., starts_with("1_more_high_")))) %>% 
  dplyr::mutate(suma_vals_escolaridad_rec_2_high = base::rowSums(dplyr::select(., starts_with("2_high_")))) %>% 
  dplyr::mutate(suma_vals_escolaridad_rec_3_primary = base::rowSums(dplyr::select(., starts_with("3_primary_"))))

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#IMPUTACIONES
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
escolaridad_rec_imputed5<-
escolaridad_rec_imputed4 %>% 
  #hacer una suma de más NA's de los que debería tener según la cantidad de tratamientos que tiene la persona
  #:#:#:#:#:#:#:#:#:
  dplyr::mutate(sum_nas_esc_post=base::rowSums(is.na(dplyr::select(., starts_with("escolaridad_rec_original_")))))%>%
  dplyr::mutate(escolaridad_rec_tot_nas_en_medio_post=dplyr::case_when(
      (sum_nas_esc_post>10 & treat_per_usr==10)|
      (sum_nas_esc_post>1 & treat_per_usr==9)|
      (sum_nas_esc_post>2 & treat_per_usr==8)|
      (sum_nas_esc_post>3 & treat_per_usr==7)|
      (sum_nas_esc_post>4 & treat_per_usr==6)|
      (sum_nas_esc_post>5 & treat_per_usr==5)|
      (sum_nas_esc_post>6 & treat_per_usr==4)|
      (sum_nas_esc_post>7 & treat_per_usr==3)|
      (sum_nas_esc_post>8 & treat_per_usr==2)|
      (sum_nas_esc_post>9 & treat_per_usr==1)~1,TRUE~0)) %>%
  #dplyr::filter(escolaridad_rec_tot_nas_en_medio_post>0,treat_per_usr>1)
  #d864967fa0b1c5bb1d4eb5f6a7c8c2c1
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#b0))valor inicial y sólo un tratamiento, se imputa por el valor imputado más frecuente de las 30 bases de datos
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
  dplyr::mutate(escolaridad_rec_original_1=dplyr::case_when(
    is.na(escolaridad_rec_original_1) & treat_per_usr==1 & 
      (escolaridad_rec_3_primary_1>escolaridad_rec_2_high_1)& 
      (escolaridad_rec_2_high_1>escolaridad_rec_3_primary_1)~3,
    is.na(escolaridad_rec_original_1) & treat_per_usr==1 & 
      (escolaridad_rec_2_high_1>escolaridad_rec_3_primary_1)& 
      (escolaridad_rec_2_high_1>escolaridad_rec_1_more_high_1)~2,
    is.na(escolaridad_rec_original_1) & treat_per_usr==1 & 
      (escolaridad_rec_1_more_high_1>escolaridad_rec_3_primary_1)& 
      (escolaridad_rec_1_more_high_1>escolaridad_rec_2_high_1)~1,
    TRUE~escolaridad_rec_original_1)) %>% 
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#b1))valor en el segundo tratamiento es intermedio, inicial se imputa, dependiendo si primaria es mayor que intermedio o no
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
  ###
  #dplyr::filter(is.na(escolaridad_rec_original_1),!is.na(escolaridad_rec_original_2)) %>%
  #dplyr::select(escolaridad_rec_original_1,escolaridad_rec_original_2,escolaridad_rec_3_primary_1,escolaridad_rec_2_high_1,escolaridad_rec_1_more_high_1) %>% View()
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#

  dplyr::mutate(escolaridad_rec_original_1=dplyr::case_when(
    is.na(escolaridad_rec_original_1) & escolaridad_rec_original_2==2 & (escolaridad_rec_3_primary_1>escolaridad_rec_2_high_1)~3,
    is.na(escolaridad_rec_original_1) & escolaridad_rec_original_2==2 & (escolaridad_rec_3_primary_1<escolaridad_rec_2_high_1)~2,TRUE~escolaridad_rec_original_1))%>%
    #dplyr::filter(escolaridad_rec_tot_nas_en_medio_post>0,treat_per_usr>1)
#610dd4dba4dbb62848691b6916828948
  #90d581cd11064c41b82f8e4d6ff7b70b
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#b2))Valor final es vacío, hay un valor anterior
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_ 
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_  
  dplyr::mutate(escolaridad_rec_original_10= dplyr::case_when(
  #
#si la educación en el tratamiento anterior es la máxima, imputar con el mismo valor
    treat_per_usr==10 & is.na(escolaridad_rec_original_10) &  escolaridad_rec_original_9==1~1,
    treat_per_usr==10 & is.na(escolaridad_rec_original_10) &  escolaridad_rec_original_9==1~1,
#si la educación en el tratamiento anterior es intermedio, ver cuál es el valor más creible (conserva intermedio o logra universitario)    
    treat_per_usr==10 & is.na(escolaridad_rec_original_10) &  escolaridad_rec_original_9==2 & 
      (escolaridad_rec_1_more_high_10>escolaridad_rec_2_high_10)~1,
    treat_per_usr==10 & is.na(escolaridad_rec_original_10) &  escolaridad_rec_original_9==2 & 
      (escolaridad_rec_1_more_high_10<escolaridad_rec_2_high_10)~2,
#si la educación en el tratamiento anterior es la más baja, ver cuál es el valor más creible (mantiene educación, logra intermedio o logra universitario)      
    treat_per_usr==10 & is.na(escolaridad_rec_original_10) &  escolaridad_rec_original_9==3 & 
      (escolaridad_rec_1_more_high_10>escolaridad_rec_2_high_10) & (escolaridad_rec_1_more_high_10>escolaridad_rec_3_primary_10)~1,
    treat_per_usr==10 & is.na(escolaridad_rec_original_10) &  escolaridad_rec_original_9==3 & 
        (escolaridad_rec_2_high_10 >escolaridad_rec_1_more_high_10) & (escolaridad_rec_2_high_10>escolaridad_rec_3_primary_10)~2,
    treat_per_usr==10 & is.na(escolaridad_rec_original_10) &  escolaridad_rec_original_9==3 & 
      (escolaridad_rec_3_primary_10 >escolaridad_rec_2_high_10) & (escolaridad_rec_3_primary_10>escolaridad_rec_1_more_high_10)~2,TRUE~escolaridad_rec_original_10)) %>% 
 # dplyr::filter(escolaridad_rec_tot_nas_en_medio_post>0,treat_per_usr>1)
  #
    dplyr::mutate(escolaridad_rec_original_9= dplyr::case_when(
#si la educación en el tratamiento anterior es la máxima, imputar con el mismo valor
    treat_per_usr==9 & is.na(escolaridad_rec_original_9) &  escolaridad_rec_original_8==1~1,
    treat_per_usr==9 & is.na(escolaridad_rec_original_9) &  escolaridad_rec_original_8==1~1,
#si la educación en el tratamiento anterior es intermedio, ver cuál es el valor más creible (conserva intermedio o logra universitario)    
    treat_per_usr==9 & is.na(escolaridad_rec_original_9) &  escolaridad_rec_original_8==2 & 
      (escolaridad_rec_1_more_high_9>escolaridad_rec_2_high_9)~1,
    treat_per_usr==9 & is.na(escolaridad_rec_original_9) &  escolaridad_rec_original_8==2 & 
      (escolaridad_rec_1_more_high_9<escolaridad_rec_2_high_9)~2,
#si la educación en el tratamiento anterior es la más baja, ver cuál es el valor más creible (mantiene educación, logra intermedio o logra universitario)      
    treat_per_usr==9 & is.na(escolaridad_rec_original_9) &  escolaridad_rec_original_8==3 & 
      (escolaridad_rec_1_more_high_9>escolaridad_rec_2_high_9) & (escolaridad_rec_1_more_high_9>escolaridad_rec_3_primary_9)~1,
    treat_per_usr==9 & is.na(escolaridad_rec_original_9) &  escolaridad_rec_original_8==3 & 
        (escolaridad_rec_2_high_9 >escolaridad_rec_1_more_high_9) & (escolaridad_rec_2_high_9>escolaridad_rec_3_primary_9)~2,
    treat_per_usr==9 & is.na(escolaridad_rec_original_9) &  escolaridad_rec_original_8==3 & 
      (escolaridad_rec_3_primary_9 >escolaridad_rec_2_high_9) & (escolaridad_rec_3_primary_9>escolaridad_rec_1_more_high_9)~2,TRUE~escolaridad_rec_original_9)) %>% 
  #
        dplyr::mutate(escolaridad_rec_original_8= dplyr::case_when(
#si la educación en el tratamiento anterior es la máxima, imputar con el mismo valor
    treat_per_usr==8 & is.na(escolaridad_rec_original_8) &  escolaridad_rec_original_7==1~1,
    treat_per_usr==8 & is.na(escolaridad_rec_original_8) &  escolaridad_rec_original_7==1~1,
#si la educación en el tratamiento anterior es intermedio, ver cuál es el valor más creible (conserva intermedio o logra universitario)    
    treat_per_usr==8 & is.na(escolaridad_rec_original_8) &  escolaridad_rec_original_7==2 & 
      (escolaridad_rec_1_more_high_8>escolaridad_rec_2_high_8)~1,
    treat_per_usr==8 & is.na(escolaridad_rec_original_8) &  escolaridad_rec_original_7==2 & 
      (escolaridad_rec_1_more_high_8<escolaridad_rec_2_high_8)~2,
#si la educación en el tratamiento anterior es la más baja, ver cuál es el valor más creible (mantiene educación, logra intermedio o logra universitario)      
    treat_per_usr==8 & is.na(escolaridad_rec_original_8) &  escolaridad_rec_original_7==3 & 
      (escolaridad_rec_1_more_high_8>escolaridad_rec_2_high_8) & (escolaridad_rec_1_more_high_8>escolaridad_rec_3_primary_8)~1,
    treat_per_usr==8 & is.na(escolaridad_rec_original_8) &  escolaridad_rec_original_7==3 & 
        (escolaridad_rec_2_high_8 >escolaridad_rec_1_more_high_8) & (escolaridad_rec_2_high_8>escolaridad_rec_3_primary_8)~2,
    treat_per_usr==8 & is.na(escolaridad_rec_original_8) &  escolaridad_rec_original_7==3 & 
      (escolaridad_rec_3_primary_8 >escolaridad_rec_2_high_8) & (escolaridad_rec_3_primary_8>escolaridad_rec_1_more_high_8)~2,TRUE~escolaridad_rec_original_8)) %>% 
  #
        dplyr::mutate(escolaridad_rec_original_7= dplyr::case_when(
          #si la educación en el tratamiento anterior es la máxima, imputar con el mismo valor
    treat_per_usr==7 & is.na(escolaridad_rec_original_7) &  escolaridad_rec_original_6==1~1,
    treat_per_usr==7 & is.na(escolaridad_rec_original_7) &  escolaridad_rec_original_6==1~1,
#si la educación en el tratamiento anterior es intermedio, ver cuál es el valor más creible (conserva intermedio o logra universitario)    
    treat_per_usr==7 & is.na(escolaridad_rec_original_7) &  escolaridad_rec_original_6==2 & 
      (escolaridad_rec_1_more_high_7>escolaridad_rec_2_high_7)~1,
    treat_per_usr==7 & is.na(escolaridad_rec_original_7) &  escolaridad_rec_original_6==2 & 
      (escolaridad_rec_1_more_high_7<escolaridad_rec_2_high_7)~2,
#si la educación en el tratamiento anterior es la más baja, ver cuál es el valor más creible (mantiene educación, logra intermedio o logra universitario)      
    treat_per_usr==7 & is.na(escolaridad_rec_original_7) &  escolaridad_rec_original_6==3 & 
      (escolaridad_rec_1_more_high_7>escolaridad_rec_2_high_7) & (escolaridad_rec_1_more_high_7>escolaridad_rec_3_primary_7)~1,
    treat_per_usr==7 & is.na(escolaridad_rec_original_7) &  escolaridad_rec_original_6==3 & 
        (escolaridad_rec_2_high_7 >escolaridad_rec_1_more_high_7) & (escolaridad_rec_2_high_7>escolaridad_rec_3_primary_7)~2,
    treat_per_usr==7 & is.na(escolaridad_rec_original_7) &  escolaridad_rec_original_6==3 & 
      (escolaridad_rec_3_primary_7 >escolaridad_rec_2_high_7) & (escolaridad_rec_3_primary_7>escolaridad_rec_1_more_high_7)~2,TRUE~escolaridad_rec_original_7)) %>% 
  #
          dplyr::mutate(escolaridad_rec_original_6= dplyr::case_when(
#si la educación en el tratamiento anterior es la máxima, imputar con el mismo valor
    treat_per_usr==6 & is.na(escolaridad_rec_original_6) &  escolaridad_rec_original_5==1~1,
    treat_per_usr==6 & is.na(escolaridad_rec_original_6) &  escolaridad_rec_original_5==1~1,
#si la educación en el tratamiento anterior es intermedio, ver cuál es el valor más creible (conserva intermedio o logra universitario)    
    treat_per_usr==6 & is.na(escolaridad_rec_original_6) &  escolaridad_rec_original_5==2 & 
      (escolaridad_rec_1_more_high_6>escolaridad_rec_2_high_6)~1,
    treat_per_usr==6 & is.na(escolaridad_rec_original_6) &  escolaridad_rec_original_5==2 & 
      (escolaridad_rec_1_more_high_6<escolaridad_rec_2_high_6)~2,
#si la educación en el tratamiento anterior es la más baja, ver cuál es el valor más creible (mantiene educación, logra intermedio o logra universitario)      
    treat_per_usr==6 & is.na(escolaridad_rec_original_6) &  escolaridad_rec_original_5==3 & 
      (escolaridad_rec_1_more_high_6>escolaridad_rec_2_high_6) & (escolaridad_rec_1_more_high_6>escolaridad_rec_3_primary_6)~1,
    treat_per_usr==6 & is.na(escolaridad_rec_original_6) &  escolaridad_rec_original_5==3 & 
        (escolaridad_rec_2_high_6 >escolaridad_rec_1_more_high_6) & (escolaridad_rec_2_high_6>escolaridad_rec_3_primary_6)~2,
    treat_per_usr==6 & is.na(escolaridad_rec_original_6) &  escolaridad_rec_original_5==3 & 
      (escolaridad_rec_3_primary_6 >escolaridad_rec_2_high_6) & (escolaridad_rec_3_primary_6>escolaridad_rec_1_more_high_6)~2,TRUE~escolaridad_rec_original_6)) %>% 
  #
          dplyr::mutate(escolaridad_rec_original_5= dplyr::case_when(
#si la educación en el tratamiento anterior es la máxima, imputar con el mismo valor
    treat_per_usr==5 & is.na(escolaridad_rec_original_5) &  escolaridad_rec_original_4==1~1,
    treat_per_usr==5 & is.na(escolaridad_rec_original_5) &  escolaridad_rec_original_4==1~1,
#si la educación en el tratamiento anterior es intermedio, ver cuál es el valor más creible (conserva intermedio o logra universitario)    
    treat_per_usr==5 & is.na(escolaridad_rec_original_5) &  escolaridad_rec_original_4==2 & 
      (escolaridad_rec_1_more_high_5>escolaridad_rec_2_high_5)~1,
    treat_per_usr==5 & is.na(escolaridad_rec_original_5) &  escolaridad_rec_original_4==2 & 
      (escolaridad_rec_1_more_high_5<escolaridad_rec_2_high_5)~2,
#si la educación en el tratamiento anterior es la más baja, ver cuál es el valor más creible (mantiene educación, logra intermedio o logra universitario)      
    treat_per_usr==5 & is.na(escolaridad_rec_original_5) &  escolaridad_rec_original_4==3 & 
      (escolaridad_rec_1_more_high_5>escolaridad_rec_2_high_5) & (escolaridad_rec_1_more_high_5>escolaridad_rec_3_primary_5)~1,
    treat_per_usr==5 & is.na(escolaridad_rec_original_5) &  escolaridad_rec_original_4==3 & 
        (escolaridad_rec_2_high_5 >escolaridad_rec_1_more_high_5) & (escolaridad_rec_2_high_5>escolaridad_rec_3_primary_5)~2,
    treat_per_usr==5 & is.na(escolaridad_rec_original_5) &  escolaridad_rec_original_4==3 & 
      (escolaridad_rec_3_primary_5 >escolaridad_rec_2_high_5) & (escolaridad_rec_3_primary_5>escolaridad_rec_1_more_high_5)~2,TRUE~escolaridad_rec_original_5)) %>% 
  #
          dplyr::mutate(escolaridad_rec_original_4= dplyr::case_when(
#si la educación en el tratamiento anterior es la máxima, imputar con el mismo valor
    treat_per_usr==4 & is.na(escolaridad_rec_original_4) &  escolaridad_rec_original_3==1~1,
    treat_per_usr==4 & is.na(escolaridad_rec_original_4) &  escolaridad_rec_original_3==1~1,
#si la educación en el tratamiento anterior es intermedio, ver cuál es el valor más creible (conserva intermedio o logra universitario)    
    treat_per_usr==4 & is.na(escolaridad_rec_original_4) &  escolaridad_rec_original_3==2 & 
      (escolaridad_rec_1_more_high_4>escolaridad_rec_2_high_4)~1,
    treat_per_usr==4 & is.na(escolaridad_rec_original_4) &  escolaridad_rec_original_3==2 & 
      (escolaridad_rec_1_more_high_4<escolaridad_rec_2_high_4)~2,
#si la educación en el tratamiento anterior es la más baja, ver cuál es el valor más creible (mantiene educación, logra intermedio o logra universitario)      
    treat_per_usr==4 & is.na(escolaridad_rec_original_4) &  escolaridad_rec_original_3==3 & 
      (escolaridad_rec_1_more_high_4>escolaridad_rec_2_high_4) & (escolaridad_rec_1_more_high_4>escolaridad_rec_3_primary_4)~1,
    treat_per_usr==4 & is.na(escolaridad_rec_original_4) &  escolaridad_rec_original_3==3 & 
        (escolaridad_rec_2_high_4 >escolaridad_rec_1_more_high_4) & (escolaridad_rec_2_high_4>escolaridad_rec_3_primary_4)~2,
    treat_per_usr==4 & is.na(escolaridad_rec_original_4) &  escolaridad_rec_original_3==3 & 
      (escolaridad_rec_3_primary_4 >escolaridad_rec_2_high_4) & (escolaridad_rec_3_primary_4>escolaridad_rec_1_more_high_4)~2,TRUE~escolaridad_rec_original_4)) %>% 
  #
          dplyr::mutate(escolaridad_rec_original_3= dplyr::case_when(
#si la educación en el tratamiento anterior es la máxima, imputar con el mismo valor
    treat_per_usr==3 & is.na(escolaridad_rec_original_3) &  escolaridad_rec_original_3==1~1,
    treat_per_usr==3 & is.na(escolaridad_rec_original_3) &  escolaridad_rec_original_3==1~1,
#si la educación en el tratamiento anterior es intermedio, ver cuál es el valor más creible (conserva intermedio o logra universitario)    
    treat_per_usr==3 & is.na(escolaridad_rec_original_3) &  escolaridad_rec_original_3==2 & 
      (escolaridad_rec_1_more_high_3>escolaridad_rec_2_high_3)~1,
    treat_per_usr==3 & is.na(escolaridad_rec_original_3) &  escolaridad_rec_original_3==2 & 
      (escolaridad_rec_1_more_high_3<escolaridad_rec_2_high_3)~2,
#si la educación en el tratamiento anterior es la más baja, ver cuál es el valor más creible (mantiene educación, logra intermedio o logra universitario)      
    treat_per_usr==3 & is.na(escolaridad_rec_original_3) &  escolaridad_rec_original_2==3 & 
      (escolaridad_rec_1_more_high_3>escolaridad_rec_2_high_3) & (escolaridad_rec_1_more_high_3>escolaridad_rec_3_primary_3)~1,
    treat_per_usr==3 & is.na(escolaridad_rec_original_3) &  escolaridad_rec_original_2==3 & 
        (escolaridad_rec_2_high_3 >escolaridad_rec_1_more_high_3) & (escolaridad_rec_2_high_3>escolaridad_rec_3_primary_3)~2,
    treat_per_usr==3 & is.na(escolaridad_rec_original_3) &  escolaridad_rec_original_2==3 & 
      (escolaridad_rec_3_primary_3 >escolaridad_rec_2_high_3) & (escolaridad_rec_3_primary_3>escolaridad_rec_1_more_high_3)~2,TRUE~escolaridad_rec_original_3))
#:#:#:#:
 # dplyr::filter(escolaridad_rec_tot_nas_en_medio_post>0,treat_per_usr>1)
 #:#:#:#:
  #comprobar si en verdad calza:
  #%>%dplyr::filter(hash_key=="ef4325cda7ddd92f6218bb910c3e0895") %>% dplyr::select(escolaridad_rec_original_1,escolaridad_rec_original_2,treat_per_usr,escolaridad_rec_3_primary_1,escolaridad_rec_2_high_1)
  #610dd4dba4dbb62848691b6916828948
  #90d581cd11064c41b82f8e4d6ff7b70b
#escolaridad_rec_imputed5 %>% 
#    dplyr::filter(escolaridad_rec_tot_nas_en_medio_post>0,treat_per_usr>1)%>%dplyr::filter(hash_key=="98d6644d995ea2c8777a683160728004") %>% dplyr::select(escolaridad_rec_original_3,escolaridad_rec_original_4,escolaridad_rec_original_4,treat_per_usr,escolaridad_rec_3_primary_4,escolaridad_rec_2_high_4,escolaridad_rec_1_more_high_4)

#98d6644d995ea2c8777a683160728004
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#b2))Valor final es vacío, hay un valor anterior
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_ 
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_  
escolaridad_rec_imputed6<-
escolaridad_rec_imputed5 %>% 
#dplyr::filter(escolaridad_rec_tot_nas_en_medio_post>0,treat_per_usr>1)%>%dplyr::filter(hash_key=="98d6644d995ea2c8777a683160728004") %>% dplyr::select(escolaridad_rec_original_4,escolaridad_rec_original_4,treat_per_usr,escolaridad_rec_3_primary_4,escolaridad_rec_2_high_4,escolaridad_rec_1_more_high_3)
  dplyr::select(hash_key,starts_with("escolaridad_rec_original_")) %>%
  tidyr::pivot_longer(cols = starts_with("escolaridad_rec_original_"),
   names_to = "rn",
   names_prefix = "escolaridad_rec_original_") %>% 
  dplyr::filter(!is.na(value)) %>% 
  dplyr::mutate(hash_rn=paste0(hash_key,"_",rn)) %>% 
  dplyr::select(hash_rn,value)
#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
CONS_C1_df_dup_SEP_2020_match_miss4<-
CONS_C1_df_dup_SEP_2020_match_miss3 %>%
  dplyr::group_by(hash_key) %>% 
  dplyr::mutate(rn=row_number()) %>% 
  dplyr::ungroup() %>% 
  dplyr::mutate(hash_rn=paste0(hash_key,"_",rn)) %>% 
  dplyr::left_join(escolaridad_rec_imputed6, by=c("hash_rn")) %>% 
  dplyr::mutate(escolaridad_rec=dplyr::case_when(value==1~"1-More than high school",value==2~"2-Completed high school or less",value==3~"3-Completed primary school or less")) %>% 
  #
  dplyr::arrange(hash_key,rn) %>% 
  #dplyr::mutate(escolaridad_rec=dplyr::case_when(is.na(escolaridad_rec)~value,TRUE~as.character(escolaridad_rec))) %>% 
  dplyr::mutate(escolaridad_rec=parse_factor(as.character(escolaridad_rec),levels=c('3-Completed primary school or less', '2-Completed high school or less', '1-More than high school'), ordered =F,trim_ws=T,include_na =F, locale=locale(encoding = "Latin1"))) %>%
  dplyr::select(-value,-hash_rn) %>% 
  data.table()

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
paste("Check inconsistencies with posterior educational attainments (0= No inconsistencies):",CONS_C1_df_dup_SEP_2020_match_miss4 %>% 
  dplyr::arrange(hash_key,rn) %>% 
  dplyr::group_by(hash_key) %>% 
  dplyr::mutate(escolaridad_rec_num=as.numeric(substr(escolaridad_rec, 1, 1)),
                sig_escolaridad_rec_num=lead(escolaridad_rec_num),
                ant_escolaridad_rec_num=lag(escolaridad_rec_num)) %>% 
  dplyr::ungroup() %>% 
  dplyr::filter(escolaridad_rec_num>ant_escolaridad_rec_num) %>% 
  dplyr::select(hash_key,rn,fech_ing_num, escolaridad_rec, escolaridad_rec_num, sig_escolaridad_rec_num,ant_escolaridad_rec_num) %>% 
  nrow())
## [1] "Check inconsistencies with posterior educational attainments (0= No inconsistencies): 0"


We ended having 257 missing values in educational attainment (users=254), because the imputed values did not fulfilled the requirements of a progression of the educational attainment (eg., a user could not respond to have completed secondary school, but then answer that he had completed primary school only), for example, due to ties in the imputed values or no imputed values.


Marital status

Additionally, we replaced missing values of the marital status (n=198). Since different marital status were not particularly more vulnerable between each other, we selected the most frequent imputed value among the different imputed databases.


# Ver distintos valores propuestos para estado conyugal
estado_conyugal_2_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$estado_conyugal_2,
       amelia_fit$imputations$imp2$estado_conyugal_2,
       amelia_fit$imputations$imp3$estado_conyugal_2,
       amelia_fit$imputations$imp4$estado_conyugal_2,
       amelia_fit$imputations$imp5$estado_conyugal_2,
       amelia_fit$imputations$imp6$estado_conyugal_2,
       amelia_fit$imputations$imp7$estado_conyugal_2,
       amelia_fit$imputations$imp8$estado_conyugal_2,
       amelia_fit$imputations$imp9$estado_conyugal_2,
       amelia_fit$imputations$imp10$estado_conyugal_2,
       amelia_fit$imputations$imp11$estado_conyugal_2,
       amelia_fit$imputations$imp12$estado_conyugal_2,
       amelia_fit$imputations$imp13$estado_conyugal_2,
       amelia_fit$imputations$imp14$estado_conyugal_2,
       amelia_fit$imputations$imp15$estado_conyugal_2,
       amelia_fit$imputations$imp16$estado_conyugal_2,
       amelia_fit$imputations$imp17$estado_conyugal_2,
       amelia_fit$imputations$imp18$estado_conyugal_2,
       amelia_fit$imputations$imp19$estado_conyugal_2,
       amelia_fit$imputations$imp20$estado_conyugal_2,
       amelia_fit$imputations$imp21$estado_conyugal_2,
       amelia_fit$imputations$imp22$estado_conyugal_2,
       amelia_fit$imputations$imp23$estado_conyugal_2,
       amelia_fit$imputations$imp24$estado_conyugal_2,
       amelia_fit$imputations$imp25$estado_conyugal_2,
       amelia_fit$imputations$imp26$estado_conyugal_2,
       amelia_fit$imputations$imp27$estado_conyugal_2,
       amelia_fit$imputations$imp28$estado_conyugal_2,
       amelia_fit$imputations$imp29$estado_conyugal_2,
       amelia_fit$imputations$imp30$estado_conyugal_2
       ) 

estado_conyugal_2_imputed<-
estado_conyugal_2_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.estado_conyugal_2:amelia_fit.imputations.imp30.estado_conyugal_2),~dplyr::case_when(grepl("Married/Shared living arrangements",as.character(.))~1,TRUE~0), .names="married_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.estado_conyugal_2:amelia_fit.imputations.imp30.estado_conyugal_2),~dplyr::case_when(grepl("Separated/Divorced",as.character(.))~1,TRUE~0), .names="sep_div_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.estado_conyugal_2:amelia_fit.imputations.imp30.estado_conyugal_2),~dplyr::case_when(grepl("Single",as.character(.))~1,TRUE~0), .names="singl_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.estado_conyugal_2:amelia_fit.imputations.imp30.estado_conyugal_2),~dplyr::case_when(grepl("Widower",as.character(.))~1,TRUE~0), .names="widow_{col}"))%>%
 
  dplyr::mutate(estado_conyugal_2_married = base::rowSums(dplyr::select(., starts_with("married_"))))%>%
  dplyr::mutate(estado_conyugal_2_sep_div = base::rowSums(dplyr::select(., starts_with("sep_div_"))))%>%
  dplyr::mutate(estado_conyugal_2_singl = base::rowSums(dplyr::select(., starts_with("singl_"))))%>%
  dplyr::mutate(estado_conyugal_2_wid = base::rowSums(dplyr::select(., starts_with("widow_"))))%>%
  #dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(estado_conyugal_2_tot=dplyr::case_when(estado_conyugal_2_married>0~1,TRUE~0)) %>% 
  dplyr::mutate(estado_conyugal_2_tot=dplyr::case_when(estado_conyugal_2_sep_div>0~estado_conyugal_2_tot+1,TRUE~estado_conyugal_2_tot)) %>% 
  dplyr::mutate(estado_conyugal_2_tot=dplyr::case_when(estado_conyugal_2_singl>0~estado_conyugal_2_tot+1,TRUE~estado_conyugal_2_tot)) %>% 
  dplyr::mutate(estado_conyugal_2_tot=dplyr::case_when(estado_conyugal_2_wid>0~estado_conyugal_2_tot+1,TRUE~estado_conyugal_2_tot)) %>% 
  janitor::clean_names()
  
estado_conyugal_2_imputed_cat_est_cony<-  
    estado_conyugal_2_imputed %>%
        tidyr::pivot_longer(c(estado_conyugal_2_married, estado_conyugal_2_sep_div, estado_conyugal_2_singl, estado_conyugal_2_wid), names_to = "cat_est_conyugal", values_to = "count") %>%
        dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
        dplyr::mutate(estado_conyugal_2_imputed_max=max(count,na.rm=T)) %>% 
        dplyr::ungroup() %>% 
        dplyr::filter(estado_conyugal_2_imputed_max==count) %>% 
        dplyr::select(amelia_fit_imputations_imp1_row,cat_est_conyugal,count) %>% 
        dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
        dplyr::mutate(n_row=n()) %>% 
        dplyr::ungroup() %>% 
        dplyr::mutate(cat_est_conyugal=dplyr::case_when(n_row>1~NA_character_,
                                                        TRUE~cat_est_conyugal)) %>% 
        dplyr::distinct(amelia_fit_imputations_imp1_row,.keep_all = T)
  
estado_conyugal_2_imputed<-
  estado_conyugal_2_imputed %>% 
    dplyr::left_join(estado_conyugal_2_imputed_cat_est_cony, by="amelia_fit_imputations_imp1_row") %>%
    dplyr::mutate(cat_est_conyugal=dplyr::case_when(cat_est_conyugal=="estado_conyugal_2_married"~"Married/Shared living arrangements",cat_est_conyugal=="estado_conyugal_2_sep_div"~"Separated/Divorced",cat_est_conyugal=="estado_conyugal_2_singl"~"Single",cat_est_conyugal=="estado_conyugal_2_wid"~"Widower"
    ))%>% 
  janitor::clean_names()

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss5<-
CONS_C1_df_dup_SEP_2020_match_miss4 %>% 
   dplyr::left_join(dplyr::select(estado_conyugal_2_imputed,amelia_fit_imputations_imp1_row,cat_est_conyugal), by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(estado_conyugal_2=factor(dplyr::case_when(is.na(estado_conyugal_2)~as.character(cat_est_conyugal),TRUE~as.character(estado_conyugal_2)))) %>% 
  data.table()

no_calzaron_estado_cony<-
CONS_C1_df_dup_SEP_2020_match_miss5 %>% dplyr::filter(is.na(estado_conyugal_2)) %>% dplyr::distinct(hash_key) %>% unlist()

#CONS_C1_df_dup_SEP_2020_match_miss5 %>% 
#dplyr::filter(hash_key %in% CONS_C1_df_dup_SEP_2020_match_miss5 %>% dplyr::filter(is.na(estado_conyugal_2)) %>% dplyr::distinct(hash_key) %>% unlist())


We could not resolve Marital status in 14 cases due to ties in the most frequent values.


Region & Type of Center (Public)

We looked over possible imputations to region of the center (n=28) and type of the center (public or private) (n=28).


# Ver distintos valores propuestos para estado conyugal
#evaluacindelprocesoteraputico nombre_region tipo_centro_pub

#no hay información. debemos imputar
no_mostrar=0
if (no_mostrar==1){
tipo_centro_nombre_region_nas_nombre_region<-
CONS_C1_df_dup_SEP_2020 %>% 
    #dplyr::filter(row %in% unlist(unique(CONS_C1_df_dup_SEP_2020_match[,"row"]))) %>% 
    dplyr::filter(is.na(nombre_region)) %>% 
    janitor::tabyl(tipo_centro, nombre_region) 
}

nombre_region_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$nombre_region,
       amelia_fit$imputations$imp2$nombre_region,
       amelia_fit$imputations$imp3$nombre_region,
       amelia_fit$imputations$imp4$nombre_region,
       amelia_fit$imputations$imp5$nombre_region,
       amelia_fit$imputations$imp6$nombre_region,
       amelia_fit$imputations$imp7$nombre_region,
       amelia_fit$imputations$imp8$nombre_region,
       amelia_fit$imputations$imp9$nombre_region,
       amelia_fit$imputations$imp10$nombre_region,
       amelia_fit$imputations$imp11$nombre_region,
       amelia_fit$imputations$imp12$nombre_region,
       amelia_fit$imputations$imp13$nombre_region,
       amelia_fit$imputations$imp14$nombre_region,
       amelia_fit$imputations$imp15$nombre_region,
       amelia_fit$imputations$imp16$nombre_region,
       amelia_fit$imputations$imp17$nombre_region,
       amelia_fit$imputations$imp18$nombre_region,
       amelia_fit$imputations$imp19$nombre_region,
       amelia_fit$imputations$imp20$nombre_region,
       amelia_fit$imputations$imp21$nombre_region,
       amelia_fit$imputations$imp22$nombre_region,
       amelia_fit$imputations$imp23$nombre_region,
       amelia_fit$imputations$imp24$nombre_region,
       amelia_fit$imputations$imp25$nombre_region,
       amelia_fit$imputations$imp26$nombre_region,
       amelia_fit$imputations$imp27$nombre_region,
       amelia_fit$imputations$imp28$nombre_region,
       amelia_fit$imputations$imp29$nombre_region,
       amelia_fit$imputations$imp30$nombre_region
       ) 
nombre_region_imputed<-
nombre_region_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Antofagasta",as.character(.))~1,TRUE~0), .names="reg_02_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Araucan",as.character(.))~1,TRUE~0), .names="reg_09_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Arica",as.character(.))~1,TRUE~0), .names="reg_15_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Atacama",as.character(.))~1,TRUE~0), .names="reg_03_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Ays",as.character(.))~1,TRUE~0), .names="reg_11_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Biob",as.character(.))~1,TRUE~0), .names="reg_08_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Coquimbo",as.character(.))~1,TRUE~0), .names="reg_04_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Los Lagos",as.character(.))~1,TRUE~0), .names="reg_10_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Los R",as.character(.))~1,TRUE~0), .names="reg_14_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Magallanes",as.character(.))~1,TRUE~0), .names="reg_12_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Maule",as.character(.))~1,TRUE~0), .names="reg_07_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Metropolitana",as.character(.))~1,TRUE~0), .names="reg_13_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("uble",as.character(.))~1,TRUE~0), .names="reg_16_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Higgins",as.character(.))~1,TRUE~0), .names="reg_06_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Tarapac",as.character(.))~1,TRUE~0), .names="reg_01_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Valpara",as.character(.))~1,TRUE~0), .names="reg_05_{col}"))%>%
  
 
  dplyr::mutate(nombre_region_02 = base::rowSums(dplyr::select(., starts_with("reg_02_"))))%>%
  dplyr::mutate(nombre_region_09 = base::rowSums(dplyr::select(., starts_with("reg_09_"))))%>%
  dplyr::mutate(nombre_region_15 = base::rowSums(dplyr::select(., starts_with("reg_15_"))))%>%
  dplyr::mutate(nombre_region_03 = base::rowSums(dplyr::select(., starts_with("reg_03_"))))%>%
  dplyr::mutate(nombre_region_11 = base::rowSums(dplyr::select(., starts_with("reg_11_"))))%>%
  dplyr::mutate(nombre_region_08 = base::rowSums(dplyr::select(., starts_with("reg_08_"))))%>%
  dplyr::mutate(nombre_region_04 = base::rowSums(dplyr::select(., starts_with("reg_04_"))))%>%
  dplyr::mutate(nombre_region_10 = base::rowSums(dplyr::select(., starts_with("reg_10_"))))%>%
  dplyr::mutate(nombre_region_14 = base::rowSums(dplyr::select(., starts_with("reg_14_"))))%>%
  dplyr::mutate(nombre_region_12 = base::rowSums(dplyr::select(., starts_with("reg_12_"))))%>%
  dplyr::mutate(nombre_region_07 = base::rowSums(dplyr::select(., starts_with("reg_07_"))))%>%
  dplyr::mutate(nombre_region_13 = base::rowSums(dplyr::select(., starts_with("reg_13_"))))%>%
  dplyr::mutate(nombre_region_16 = base::rowSums(dplyr::select(., starts_with("reg_16_"))))%>%
  dplyr::mutate(nombre_region_06 = base::rowSums(dplyr::select(., starts_with("reg_06_"))))%>%
  dplyr::mutate(nombre_region_01 = base::rowSums(dplyr::select(., starts_with("reg_01_"))))%>%
  dplyr::mutate(nombre_region_05 = base::rowSums(dplyr::select(., starts_with("reg_05_"))))%>%
  #dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_02>0~1,TRUE~0)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_09>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_15>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_03>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>%
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_11>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_08>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_04>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_10>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_14>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_12>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_07>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_13>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_16>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_06>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_01>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_05>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  janitor::clean_names()
  
nombre_region_imputed_cat_reg<-  
    nombre_region_imputed %>%
        tidyr::pivot_longer(c(nombre_region_01, nombre_region_02, nombre_region_03, nombre_region_04, nombre_region_05, nombre_region_06, nombre_region_07, nombre_region_08, nombre_region_09, nombre_region_10, nombre_region_11, nombre_region_12, nombre_region_13, nombre_region_14, nombre_region_15), names_to = "cat_nombre_region", values_to = "count") %>%
        dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
        dplyr::mutate(nombre_region_imputed_max=max(count,na.rm=T)) %>% 
        dplyr::ungroup() %>% 
        dplyr::filter(nombre_region_imputed_max==count) %>% 
        dplyr::select(amelia_fit_imputations_imp1_row,cat_nombre_region,count) %>% 
        dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
        dplyr::mutate(n_row=n()) %>% 
        dplyr::ungroup() %>% 
        dplyr::mutate(cat_nombre_region=dplyr::case_when(n_row>1~NA_character_,
                                                        TRUE~cat_nombre_region)) %>% 
        dplyr::distinct(amelia_fit_imputations_imp1_row,.keep_all = T)
  
nombre_region_imputed<-
  nombre_region_imputed %>% 
    dplyr::left_join(nombre_region_imputed_cat_reg, by="amelia_fit_imputations_imp1_row") %>%
    dplyr::mutate(cat_nombre_region=dplyr::case_when(cat_nombre_region=="nombre_region_01"~"Tarapacá (01)",cat_nombre_region=="nombre_region_02"~"Antofagasta (02)",cat_nombre_region=="nombre_region_03"~"Atacama (03)",cat_nombre_region=="nombre_region_04"~"Coquimbo (04)",cat_nombre_region=="nombre_region_05"~"Valparaíso (05)",cat_nombre_region=="nombre_region_06"~"O'Higgins (06)",cat_nombre_region=="nombre_region_07"~"Maule (07)",cat_nombre_region=="nombre_region_08"~"Biobío (08)",cat_nombre_region=="nombre_region_09"~"Araucanía (09)",cat_nombre_region=="nombre_region_10"~"Los Lagos (10)",cat_nombre_region=="nombre_region_11"~"Aysén (11)",cat_nombre_region=="nombre_region_12"~"Magallanes (12)",cat_nombre_region=="nombre_region_13"~"Metropolitana (13)",
                                                 cat_nombre_region=="nombre_region_14"~"Los Ríos (14)",cat_nombre_region=="nombre_region_15"~"Arica (15)",cat_nombre_region=="nombre_region_16"~"Ñuble (16)",
    ))%>% 
  janitor::clean_names()

#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_
tipo_centro_pub_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$tipo_centro_pub,
       amelia_fit$imputations$imp2$tipo_centro_pub,
       amelia_fit$imputations$imp3$tipo_centro_pub,
       amelia_fit$imputations$imp4$tipo_centro_pub,
       amelia_fit$imputations$imp5$tipo_centro_pub,
       amelia_fit$imputations$imp6$tipo_centro_pub,
       amelia_fit$imputations$imp7$tipo_centro_pub,
       amelia_fit$imputations$imp8$tipo_centro_pub,
       amelia_fit$imputations$imp9$tipo_centro_pub,
       amelia_fit$imputations$imp10$tipo_centro_pub,
       amelia_fit$imputations$imp11$tipo_centro_pub,
       amelia_fit$imputations$imp12$tipo_centro_pub,
       amelia_fit$imputations$imp13$tipo_centro_pub,
       amelia_fit$imputations$imp14$tipo_centro_pub,
       amelia_fit$imputations$imp15$tipo_centro_pub,
       amelia_fit$imputations$imp16$tipo_centro_pub,
       amelia_fit$imputations$imp17$tipo_centro_pub,
       amelia_fit$imputations$imp18$tipo_centro_pub,
       amelia_fit$imputations$imp19$tipo_centro_pub,
       amelia_fit$imputations$imp20$tipo_centro_pub,
       amelia_fit$imputations$imp21$tipo_centro_pub,
       amelia_fit$imputations$imp22$tipo_centro_pub,
       amelia_fit$imputations$imp23$tipo_centro_pub,
       amelia_fit$imputations$imp24$tipo_centro_pub,
       amelia_fit$imputations$imp25$tipo_centro_pub,
       amelia_fit$imputations$imp26$tipo_centro_pub,
       amelia_fit$imputations$imp27$tipo_centro_pub,
       amelia_fit$imputations$imp28$tipo_centro_pub,
       amelia_fit$imputations$imp29$tipo_centro_pub,
       amelia_fit$imputations$imp30$tipo_centro_pub
       ) %>% 
  melt(id.vars="amelia_fit$imputations$imp1$row") %>% 
  janitor::clean_names() %>% 
  dplyr::filter(value==TRUE) %>% 
  dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
  dplyr::summarise(tipo_centro_pub_to_imputation=ifelse(n()>15,1,0))

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss6<-
CONS_C1_df_dup_SEP_2020_match_miss5 %>% 
   dplyr::left_join(dplyr::select(nombre_region_imputed,amelia_fit_imputations_imp1_row,cat_nombre_region), by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(nombre_region=factor(dplyr::case_when(is.na(nombre_region)~as.character(cat_nombre_region),TRUE~as.character(nombre_region)))) %>% 
  dplyr::left_join(dplyr::select(tipo_centro_pub_imputed,amelia_fit_imputations_imp1_row,tipo_centro_pub_to_imputation), by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
  dplyr::mutate(tipo_centro_pub=factor(dplyr::case_when(is.na(tipo_centro_pub)~as.logical(tipo_centro_pub_to_imputation),TRUE~as.logical(tipo_centro_pub)))) %>%
  dplyr::select(-c(cat_est_conyugal,cat_nombre_region,tipo_centro_pub_to_imputation,tipo_centro_pub_to_imputation)) %>% 
  data.table()
#CONS_C1_df_dup_SEP_2020_match_miss6
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss6$tipo_centro_pub))
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss6$nombre_region))


There were impossible to impute region of the center in 11 cases due to ties in the different imputed values. In case of public or private center, there were no missing values once imputed.


Diagnose of Drug Consumption

We looked over possible imputations to the diagnosis of drug consumption (n=1).


# Ver distintos valores propuestos para estado conyugal
#evaluacindelprocesoteraputico nombre_region tipo_centro_pub

dg_trs_cons_sus_or_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp2$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp3$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp4$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp5$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp6$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp7$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp8$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp9$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp10$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp11$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp12$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp13$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp14$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp15$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp16$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp17$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp18$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp19$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp20$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp21$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp22$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp23$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp24$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp25$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp26$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp27$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp28$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp29$dg_trs_cons_sus_or,
       amelia_fit$imputations$imp30$dg_trs_cons_sus_or
       ) %>% 
  melt(id.vars="amelia_fit$imputations$imp1$row") %>% 
  janitor::clean_names() %>% 
  dplyr::filter(value==TRUE) %>% 
  dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
  dplyr::summarise(dg_trs_cons_imputation=ifelse(n()>15,1,0))

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss7<-
CONS_C1_df_dup_SEP_2020_match_miss6 %>% 
    dplyr::left_join(dplyr::select(dg_trs_cons_sus_or_imputed,amelia_fit_imputations_imp1_row,dg_trs_cons_imputation), by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
  dplyr::mutate(dg_trs_cons_sus_or=factor(dplyr::case_when(is.na(dg_trs_cons_sus_or)~as.logical(dg_trs_cons_imputation),TRUE~as.logical(dg_trs_cons_sus_or)))) %>%
  dplyr::select(-dg_trs_cons_imputation) %>% 
  data.table()
#CONS_C1_df_dup_SEP_2020_match_miss6
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss6$tipo_centro_pub))
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss6$nombre_region))


Cause of Discharge

We looked over possible imputations to the truly missing values, discarding missing values due to censorship (n=20).

motivo_de_egreso_a_imputar<-
CONS_C1_df_dup_SEP_2020_match_miss %>% dplyr::filter(is.na(motivodeegreso_mod_imp)) %>% dplyr::left_join(dplyr::select(CONS_C1_df_dup_SEP_2020,row,fech_egres_imp)) %>% dplyr::filter(!is.na(fech_egres_imp))%>%dplyr::select(row)

motivodeegreso_mod_imp_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp2$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp3$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp4$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp5$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp6$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp7$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp8$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp9$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp10$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp11$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp12$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp13$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp14$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp15$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp16$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp17$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp18$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp19$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp20$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp21$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp22$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp23$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp24$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp25$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp26$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp27$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp28$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp29$motivodeegreso_mod_imp,
       amelia_fit$imputations$imp30$motivodeegreso_mod_imp
       ) %>% 
  melt(id.vars="amelia_fit$imputations$imp1$row") %>% 
  janitor::clean_names() %>% 
  dplyr::arrange(amelia_fit_imputations_imp1_row) %>% 
  dplyr::ungroup() %>% 
  dplyr::filter(amelia_fit_imputations_imp1_row %in% unlist(motivo_de_egreso_a_imputar$row)) %>% 
  #FILTRAR CASOS QUE SON ILÓGICOS: MUERTES CON TRATAMIENTOS POSTERIORES (1)
  dplyr::left_join(dplyr::select(CONS_C1_df_dup_SEP_2020,row,motivodeegreso_mod_imp, fech_egres_imp,dup, duplicates_filtered,evaluacindelprocesoteraputico,fech_ing_next_treat),by=c("amelia_fit_imputations_imp1_row"="row")) %>% 
  dplyr::mutate(value_death=dplyr::case_when(value=="Death"& !is.na(fech_ing_next_treat)~1,TRUE~0)) %>% 
  dplyr::filter(value_death!=1) %>%  
  #:#:#:#:#:
  dplyr::count(amelia_fit_imputations_imp1_row,value) %>% 
  dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
  dplyr::slice_min(n, n = 1) %>% 
  dplyr::summarise(adm_dis=sum(value == "Administrative discharge",na.rm=T),
                    death=sum(value == "Death",na.rm=T),
                    referral=sum(value == "Referral to another treatment",na.rm=T),
                    ter_dis=sum(value == "Therapeutic discharge",na.rm=T),
                    dropout=sum(value =="Drop-out",na.rm=T)) %>% 
  rowwise() %>% 
  dplyr::mutate(ties=sum(c_across(adm_dis:dropout)),ties=ifelse(ties>1,1,0)) %>% 
  #dplyr::filter(ties==1) %>% 
  dplyr::ungroup() %>% 
  dplyr::left_join(dplyr::select(CONS_C1_df_dup_SEP_2020,row,motivodeegreso_mod_imp, fech_egres_imp,fech_egres_num,dup, duplicates_filtered,evaluacindelprocesoteraputico,tipo_centro_derivacion),by=c("amelia_fit_imputations_imp1_row"="row")) %>% 
  dplyr::mutate(motivodeegreso_mod_imp_imputation= dplyr::case_when(
    ties==0 & adm_dis==1 & fech_egres_imp<"2019-11-13"~"Administrative discharge",
    #its an absorbing state. should not have posterior treatments
    ties==0 & death==1 & fech_egres_imp<"2019-11-13" & dup==duplicates_filtered~"Death",
    ties==0 & referral==1 & fech_egres_imp<"2019-11-13"~"Referral to another treatment",
    ties==0 & ter_dis==1 & fech_egres_imp<"2019-11-13"~"Therapeutic discharge",
    ties==0 & dropout==1 & fech_egres_imp<"2019-11-13"~"Drop-out",
    #si no hay fecha de egreso, está en la fecha de censura, sólo puede ser tratamiento en curso
    fech_egres_imp>="2019-11-13"~NA_character_,
    TRUE~NA_character_)) %>% 
    #si tiene evaluacindelprocesoteraputico, es porque no es un tratamiento en curso
  dplyr::rename("motivodeegreso_mod_imp_original"="motivodeegreso_mod_imp")

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:
CONS_C1_df_dup_SEP_2020_match_miss8<-
CONS_C1_df_dup_SEP_2020_match_miss7 %>% 
   dplyr::left_join(motivodeegreso_mod_imp_imputed[,c("amelia_fit_imputations_imp1_row","motivodeegreso_mod_imp_original","fech_egres_imp","fech_egres_num","motivodeegreso_mod_imp_imputation")], by=c("row"="amelia_fit_imputations_imp1_row")) %>%
  #dplyr::filter(is.na(motivodeegreso_mod_imp)) %>% dplyr::select(row,hash_key,motivodeegreso_mod_imp_original, motivodeegreso_mod_imp_imputation,motivodeegreso_mod_imp,fech_egres_num,fech_egres_imp)
      dplyr::mutate(motivodeegreso_mod_imp=factor(dplyr::case_when(is.na(motivodeegreso_mod_imp)~motivodeegreso_mod_imp_imputation,
                                                                   motivodeegreso_mod_imp_original=="Ongoing treatment"~NA_character_, TRUE~as.character(motivodeegreso_mod_imp)))) %>% 
  dplyr::select(-motivodeegreso_mod_imp_imputation,-fech_egres_imp,-fech_egres_num,-motivodeegreso_mod_imp_original) %>% 
  #dplyr::rename_all( list(~paste0(., ".left"))) %>% 
  dplyr::left_join(dplyr::select(CONS_C1_df_dup_SEP_2020,row,motivodeegreso_mod_imp) %>% 
                     dplyr::rename("motivodeegreso_mod_imp_original"="motivodeegreso_mod_imp"),by="row") %>%
  data.table()

# CONS_C1_df_dup_SEP_2020_match_miss8 %>% janitor::tabyl(motivodeegreso_mod_imp,motivodeegreso_mod_imp_original)
#CONS_C1_df_dup_SEP_2020_match_miss8 %>% janitor::tabyl(motivodeegreso_mod_imp_original)

#
if(
CONS_C1_df_dup_SEP_2020_match_miss8 %>% dplyr::filter(motivodeegreso_mod_imp_original!="Ongoing treatment",is.na(motivodeegreso_mod_imp)) %>% nrow()>0){"There are missing values on the cause of discharge"}


A total of 3 cases were not imputed due to ties in the imputed values.


Evaluation of the Therapeutic Process

Another variable that is worth imputing is the Evaluation of the Therapeutic Process at Discharge (n= 7,378). In case of ties, we selected the imputed values with the value with the minimum evaluation. Must consider that most of the null values could be explained by censoring or not completion of the treatment at the period of the study (n= 7,361).


# Ver distintos valores propuestos para sustancia de inciio
evaluacindelprocesoteraputico_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp2$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp3$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp4$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp5$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp6$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp7$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp8$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp9$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp10$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp11$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp12$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp13$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp14$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp15$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp16$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp17$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp18$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp19$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp20$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp21$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp22$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp23$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp24$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp25$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp26$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp27$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp28$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp29$evaluacindelprocesoteraputico,
       amelia_fit$imputations$imp30$evaluacindelprocesoteraputico
       ) %>% 
  melt(id.vars="amelia_fit$imputations$imp1$row") %>% 
  janitor::clean_names() %>% 
  dplyr::arrange(amelia_fit_imputations_imp1_row) %>% 
  dplyr::ungroup() %>% 
  dplyr::group_by(amelia_fit_imputations_imp1_row) %>%
  dplyr::summarise(high_ach_1=sum(value == "1-High Achievement",na.rm=T),
                   med_ach_2=sum(value == "2-Medium Achievement",na.rm=T),
                  min_ach_3=sum(value =="3-Minimum Achievement",na.rm=T)) %>% 
  dplyr::ungroup() %>% 
  dplyr::mutate(evaluacindelprocesoteraputico_imputation= dplyr::case_when(
      (high_ach_1 >med_ach_2) & (med_ach_2 >min_ach_3)~"1-High Achievement",
      (med_ach_2>high_ach_1) & (med_ach_2 >min_ach_3)~"2-Medium Achievement",
      (min_ach_3>med_ach_2) & (min_ach_3 >high_ach_1)~"3-Minimum Achievement"))

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
##
#CONS_C1_df_dup_SEP_2020 %>% janitor::tabyl(motivodeegreso_mod_imp,evaluacindelprocesoteraputico)

CONS_C1_df_dup_SEP_2020_match_miss9<-
CONS_C1_df_dup_SEP_2020_match_miss8 %>% 
   dplyr::left_join(evaluacindelprocesoteraputico_imputed[,c("amelia_fit_imputations_imp1_row","evaluacindelprocesoteraputico_imputation")], by=c("row"="amelia_fit_imputations_imp1_row")) %>%
    dplyr::mutate(evaluacindelprocesoteraputico=factor(dplyr::case_when(is.na(evaluacindelprocesoteraputico) & motivodeegreso_mod_imp %in% c("Drop-out","Administrative discharge","Therapeutic discharge","Referral to another treatment")~evaluacindelprocesoteraputico_imputation,
                                                                        is.na(motivodeegreso_mod_imp)~NA_character_,
                                                                        TRUE~as.character(evaluacindelprocesoteraputico)))) %>% 
     dplyr::mutate(evaluacindelprocesoteraputico=parse_factor(as.character(evaluacindelprocesoteraputico),levels=c('1-High Achievement', '2-Medium Achievement','3-Minimum Achievement'), ordered =T,trim_ws=T,include_na =F, locale=locale(encoding = "UTF-8"))) %>% 
  dplyr::select(-evaluacindelprocesoteraputico_imputation) %>% 
  data.table()

CONS_C1_df_dup_SEP_2020_match_miss9 %>% janitor::tabyl(motivodeegreso_mod_imp,evaluacindelprocesoteraputico) %>% 
    knitr::kable(.,format = "html", format.args = list(decimal.mark = ".", big.mark = ","),
               caption = paste0("Table 2. Cause of Discharge vs. Evaluation of the Therapeutic Procress"),
               col.names = c("Cause of Discharge","1-High Achievement", "2- Medium Achievement","3- Minimum Achievement","Null Values"),
               align =rep('c', 101)) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 9) %>%
  kableExtra::scroll_box(width = "100%", height = "375px") 
Table 2. Cause of Discharge vs. Evaluation of the Therapeutic Procress
Cause of Discharge 1-High Achievement 2- Medium Achievement 3- Minimum Achievement Null Values
Administrative discharge 866 4,428 4,487 2
Death 0 0 1 0
Drop-out 1,767 16,839 37,301 0
Referral to another treatment 1,298 5,834 4,706 1
Therapeutic discharge 17,120 6,135 1,117 0
NA 0 0 0 7,854


As seen in the table above, ongoing treatments did not have an evaluation process, which is logically valid, since their treatment competition was not captured.


Treatment Setting (Residential)

We looked over possible imputations to the treatment setting (n=97).


# Ver distintos valores propuestos para estado conyugal
#evaluacindelprocesoteraputico nombre_region tipo_centro_pub

tipo_de_plan_res_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$tipo_de_plan_res,
       amelia_fit$imputations$imp2$tipo_de_plan_res,
       amelia_fit$imputations$imp3$tipo_de_plan_res,
       amelia_fit$imputations$imp4$tipo_de_plan_res,
       amelia_fit$imputations$imp5$tipo_de_plan_res,
       amelia_fit$imputations$imp6$tipo_de_plan_res,
       amelia_fit$imputations$imp7$tipo_de_plan_res,
       amelia_fit$imputations$imp8$tipo_de_plan_res,
       amelia_fit$imputations$imp9$tipo_de_plan_res,
       amelia_fit$imputations$imp10$tipo_de_plan_res,
       amelia_fit$imputations$imp11$tipo_de_plan_res,
       amelia_fit$imputations$imp12$tipo_de_plan_res,
       amelia_fit$imputations$imp13$tipo_de_plan_res,
       amelia_fit$imputations$imp14$tipo_de_plan_res,
       amelia_fit$imputations$imp15$tipo_de_plan_res,
       amelia_fit$imputations$imp16$tipo_de_plan_res,
       amelia_fit$imputations$imp17$tipo_de_plan_res,
       amelia_fit$imputations$imp18$tipo_de_plan_res,
       amelia_fit$imputations$imp19$tipo_de_plan_res,
       amelia_fit$imputations$imp20$tipo_de_plan_res,
       amelia_fit$imputations$imp21$tipo_de_plan_res,
       amelia_fit$imputations$imp22$tipo_de_plan_res,
       amelia_fit$imputations$imp23$tipo_de_plan_res,
       amelia_fit$imputations$imp24$tipo_de_plan_res,
       amelia_fit$imputations$imp25$tipo_de_plan_res,
       amelia_fit$imputations$imp26$tipo_de_plan_res,
       amelia_fit$imputations$imp27$tipo_de_plan_res,
       amelia_fit$imputations$imp28$tipo_de_plan_res,
       amelia_fit$imputations$imp29$tipo_de_plan_res,
       amelia_fit$imputations$imp30$tipo_de_plan_res
       ) %>% 
  melt(id.vars="amelia_fit$imputations$imp1$row") %>% 
  janitor::clean_names() %>% 
  dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
  dplyr::summarise(n_res=sum(value=="1",na.rm=T),n_amb=sum(value=="0",na.rm=T))

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss10<-
CONS_C1_df_dup_SEP_2020_match_miss9 %>% 
    dplyr::left_join(dplyr::select(tipo_de_plan_res_imputed,amelia_fit_imputations_imp1_row,n_res,n_amb), by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
  dplyr::mutate(tipo_de_plan_res=factor(dplyr::case_when(is.na(tipo_de_plan_res)& (n_res>n_amb)~"1",is.na(tipo_de_plan_res)& (n_res<n_amb)~"0",TRUE~as.character(tipo_de_plan_res)))) %>%
  dplyr::select(-n_res,-n_amb) %>% 
  data.table()
#CONS_C1_df_dup_SEP_2020_match_miss6
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss6$tipo_centro_pub))
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss6$nombre_region))

As a result of the process of imputation, some values were not possible to impute (n=97).


Sample Characteristics

We checked the characteristics of the sample depending on type of treatment (Residential or Outpatients).


#prop.table(table(CONS_C1_df_dup_SEP_2020_match$abandono_temprano_rec,CONS_C1_df_dup_SEP_2020_match$tipo_de_plan_res),2)
match.on_tot <- c("row", "hash_key","sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","edad_ini_cons","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","nombre_region","tipo_centro_pub","abandono_temprano_rec","evaluacindelprocesoteraputico","motivodeegreso_mod_imp","dg_trs_cons_sus_or","tipo_de_plan_res","sexo_2","edad_al_ing","fech_ing_num")
#$109,756
#añado los imputados
CONS_C1_df_dup_SEP_2020_match_miss_after_imp<-
CONS_C1_df_dup_SEP_2020_match_miss %>% 
  dplyr::select(-sus_ini_mod_mvv,-estado_conyugal_2,-escolaridad_rec,-freq_cons_sus_prin,-nombre_region,-tipo_centro_pub,-evaluacindelprocesoteraputico,-motivodeegreso_mod_imp,-dg_trs_cons_sus_or,-tipo_de_plan_res,-edad_ini_cons,-via_adm_sus_prin_act) %>% #
  dplyr::left_join(dplyr::select(CONS_C1_df_dup_SEP_2020_match_miss10,
                                 row,
                                 sus_ini_mod_mvv,
                                 estado_conyugal_2,
                                 escolaridad_rec,
                                 freq_cons_sus_prin,
                                 nombre_region,
                                 tipo_centro_pub,
                                 evaluacindelprocesoteraputico,
                                 motivodeegreso_mod_imp,
                                 dg_trs_cons_sus_or,
                                 tipo_de_plan_res,
                                 edad_ini_cons,rn),by="row") %>% 
  dplyr::arrange(tipo_de_plan_res,hash_key,rn) %>% 
  #elimino esta variable porque es accesoria
  dplyr::select(-edad_ini_sus_prin) %>% 
  #para transformar el motivo de egreso
  dplyr::left_join(dplyr::select(CONS_C1_df_dup_SEP_2020,row,fech_egres_num,dias_treat_imp_sin_na),by="row") %>%
  #dplyr::filter(fech_egres_num==18213,!is.na(motivodeegreso_mod_imp)) %>% 
  dplyr::mutate(motivodeegreso_mod_imp=dplyr::case_when(dias_treat_imp_sin_na>=90 & motivodeegreso_mod_imp=="Drop-out"~ "Late Drop-out",
                                                        dias_treat_imp_sin_na<90 & motivodeegreso_mod_imp=="Drop-out"~ "Early Drop-out",
                                                        fech_egres_num==18213 & is.na(motivodeegreso_mod_imp)~"Ongoing treatment",
                                                        TRUE~as.character(motivodeegreso_mod_imp)
                                                        )) %>% #janitor::tabyl(motivodeegreso_mod_imp)
  dplyr::mutate(evaluacindelprocesoteraputico2=dplyr::case_when(fech_egres_num==18213 & is.na(evaluacindelprocesoteraputico)~"Ongoing treatment",
                                                        TRUE~as.character(evaluacindelprocesoteraputico)
  )) %>% 
  dplyr::mutate(sum_miss = base::rowSums(is.na(dplyr::select(.,c("sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","freq_cons_sus_prin","nombre_region","tipo_centro_pub","evaluacindelprocesoteraputico2","motivodeegreso_mod_imp","dg_trs_cons_sus_or","tipo_de_plan_res","edad_ini_cons","sexo_2","edad_al_ing","fech_ing_num"))))) %>% 
  dplyr::group_by(hash_key) %>% 
  dplyr::mutate(sum_miss=sum(sum_miss)) %>% 
  dplyr::ungroup() 

CONS_C1_df_dup_SEP_2020_match_miss_after_imp_descartados <-
  CONS_C1_df_dup_SEP_2020_match_miss_after_imp %>% 
  dplyr::filter(sum_miss>0)

CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados <-
  CONS_C1_df_dup_SEP_2020_match_miss_after_imp %>% 
  dplyr::filter(sum_miss==0) %>% 
  dplyr::select(-sum_miss) %>% 
  dplyr::left_join(CONS_C1_df_dup_SEP_2020[c("row","condicion_ocupacional_corr")], by="row") %>% 
  dplyr::select(-evaluacindelprocesoteraputico2)

#  CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados[complete.cases(CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados[,..match.on_tot]),..match.on_tot] 


Considering that some missing values were not able to imputation (due to ties in the candidate values for imputation or inconsistent values for imputations) (377, users=293), we ended the process having 109,379 complete cases (users=84,755).


kableone <- function(x, caption=NULL, col.names=NA, smd=T, test=T, varLabels=T, noSpaces=T, printToggle=T, dropEqual=F, ...) {
  capture.output(x <- print(x, smd=T, test=test, varLabels=varLabels,noSpaces=noSpaces, printToggle=printToggle, dropEqual=dropEqual, ...))
  
  knitr::kable(x,format= "html", format.args= list(decimal.mark= ".", big.mark= ","),
               caption= caption, col.names= col.names)
}

match.on.sel<-c("sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","edad_ini_cons","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","nombre_region","dg_trs_cons_sus_or", "tipo_centro_pub","sexo_2","edad_al_ing","fech_ing_num","condicion_ocupacional_corr")
catVars<-
c("sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","tipo_centro_pub","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","dg_trs_cons_sus_or","nombre_region","tipo_de_plan_res","sexo_2","condicion_ocupacional_corr")
#length(unique(CONS_C1_df_dup_SEP_2020_match$fech_ing_num))
#:#:#:#:#: DISMINUIR LA HETEROGENEIDAD DE LA FECHA DE INGRESO
# FORMAS DE CONSTREÑIR LA VARIABLE:
#CONS_C1_df_dup_SEP_2020_match$fech_ing_num<-round(CONS_C1_df_dup_SEP_2020_match$fech_ing_num/10,0)
#CONS_C1_df_dup_SEP_2020_match$fech_ing_num<-cut(CONS_C1_df_dup_SEP_2020_match$fech_ing_num,100)
#CONS_C1_df_dup_SEP_2020_match$fech_ing_num<-CONS_C1_df_dup_SEP_2020_match_fech_ing_num
#CONS_C1_df_dup_SEP_2020_match_fech_ing_num<-CONS_C1_df_dup_SEP_2020_match$fech_ing_num
#length(unique(round(CONS_C1_df_dup_SEP_2020_match$fech_ing_num,0)))
#length(unique(round(CONS_C1_df_dup_SEP_2020_match$fech_ing_num/10,0)))

#CONS_C1_df_dup_SEP_2020_match$fech_ing_num<-round(CONS_C1_df_dup_SEP_2020_match$fech_ing_num/10,0)
#:#:#:#:#: 

paste0("Inconsistencies in dup vs. rn: ",CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados%>% 
         dplyr::filter(dup!=rn) %>% nrow())
## [1] "Inconsistencies in dup vs. rn: 0"
CONS_C1_df_dup_SEP_2020_match_not_miss2 <-
  CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados %>% 
  dplyr::filter(dup==1) %>% 
  dplyr::select(-rn,-dias_treat_imp_sin_na,-fech_egres_num)

attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$sus_ini_mod_mvv,"label")<-"Starting Substance"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$estado_conyugal_2,"label")<-"Marital Status"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$escolaridad_rec,"label")<-"Educational Attainment"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$edad_ini_cons,"label")<-"Age of Onset of Drug Use"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$freq_cons_sus_prin,"label")<-"Frequency of use of primary drug"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$nombre_region,"label")<-"Region of the Center"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$dg_cie_10_rec,"label")<-"Psychiatric Comorbidity"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$dg_trs_cons_sus_or,"label")<-"Drug Dependence"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$evaluacindelprocesoteraputico,"label")<-"Evaluation of the Therapeutic Process"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$abandono_temprano_rec,"label")<-"Early Discharge"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$tipo_de_plan_res,"label")<-"Residential"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$tipo_centro_pub,"label")<-"Public Center"
attr(CONS_C1_df_dup_SEP_2020_match_not_miss2$condicion_ocupacional_corr,"label")<-"Occupational Status"

pre_tab1<-Sys.time()
tab1<-
CreateTableOne(vars = match.on.sel, strata = "tipo_de_plan_res", 
                       data = CONS_C1_df_dup_SEP_2020_match_not_miss2, factorVars = catVars, smd=T)
post_tab1<-Sys.time()
diff_time_tab1=post_tab1-pre_tab1

kableone(tab1, 
         caption = paste0("Table 5. Covariate Balance in the Variables of Interest"),
         col.names= c("Ambulatory","Residential", "p-values","test","SMD"),
         nonnormal= c("edad_ini_cons","edad_al_ing","fech_ing_num"),#"\\hline",
                       smd=T, test=T, varLabels=T,noSpaces=T, printToggle=T, dropEqual=F) %>% 
    kableExtra::kable_styling(bootstrap_options = c("striped", "hover","condensed"),font_size= 10) %>%
  #()
  row_spec(1, bold = T, italic =T,color ="black",hline_after=T,extra_latex_after="\\arrayrulecolor{white}",font_size= 10) %>%
  #footnote(general = "Here is a general comments of the table. ",
  #        number = c("Footnote 1; ", "Footnote 2; "),
  #         alphabet = c("Footnote A; ", "Footnote B; "),
  #         symbol = c("Footnote Symbol 1; ", "Footnote Symbol 2")
  #         )%>%
  scroll_box(width = "100%", height = "400px") 
Table 5. Covariate Balance in the Variables of Interest
Ambulatory Residential p-values test SMD
n 72066 12689
Starting Substance (%) <0.001 0.368
Alcohol 41397 (57.4) 5073 (40.0)
Cocaine hydrochloride 2898 (4.0) 521 (4.1)
Cocaine paste 7688 (10.7) 2221 (17.5)
Marijuana 18423 (25.6) 4557 (35.9)
Other 1660 (2.3) 317 (2.5)
Marital Status (%) <0.001 0.309
Married/Shared living arrangements 26158 (36.3) 2910 (22.9)
Separated/Divorced 7706 (10.7) 1317 (10.4)
Single 37338 (51.8) 8331 (65.7)
Widower 864 (1.2) 131 (1.0)
Educational Attainment (%) <0.001 0.125
3-Completed primary school or less 21848 (30.3) 4572 (36.0)
2-Completed high school or less 37209 (51.6) 6134 (48.3)
1-More than high school 13009 (18.1) 1983 (15.6)
Age of Onset of Drug Use (median [IQR]) 15.00 [14.00, 18.00] 15.00 [13.00, 17.00] <0.001 nonnorm 0.090
Frequency of use of primary drug (%) <0.001 0.767
1 day a week or more 5325 (7.4) 271 (2.1)
2 to 3 days a week 22315 (31.0) 1326 (10.4)
4 to 6 days a week 12216 (17.0) 1648 (13.0)
Daily 28256 (39.2) 9228 (72.7)
Did not use 1096 (1.5) 83 (0.7)
Less than 1 day a week 2858 (4.0) 133 (1.0)
Origen de Ingreso (Primera Entrada)/Motive of Admission to Treatment (First Entry) (%) <0.001 0.510
Spontaneous 33633 (46.7) 4272 (33.7)
Assisted Referral 4933 (6.8) 3005 (23.7)
Other 3756 (5.2) 737 (5.8)
Justice Sector 7136 (9.9) 809 (6.4)
Health Sector 22608 (31.4) 3866 (30.5)
Psychiatric Comorbidity (%) <0.001 0.318
Without psychiatric comorbidity 29006 (40.2) 3244 (25.6)
Diagnosis unknown (under study) 13264 (18.4) 2769 (21.8)
With psychiatric comorbidity 29796 (41.3) 6676 (52.6)
Region of the Center (%) <0.001 0.388
Antofagasta (02) 2292 (3.2) 697 (5.5)
Araucanía (09) 2219 (3.1) 161 (1.3)
Arica (15) 1313 (1.8) 728 (5.7)
Atacama (03) 1830 (2.5) 259 (2.0)
Aysén (11) 797 (1.1) 42 (0.3)
Biobío (08) 5089 (7.1) 702 (5.5)
Coquimbo (04) 2800 (3.9) 269 (2.1)
Los Lagos (10) 2644 (3.7) 374 (2.9)
Los Ríos (14) 1111 (1.5) 185 (1.5)
Magallanes (12) 929 (1.3) 31 (0.2)
Maule (07) 4207 (5.8) 643 (5.1)
Metropolitana (13) 35957 (49.9) 6249 (49.2)
Ñuble (16) 539 (0.7) 20 (0.2)
O’Higgins (06) 3640 (5.1) 568 (4.5)
Tarapacá (01) 1347 (1.9) 597 (4.7)
Valparaíso (05) 5352 (7.4) 1164 (9.2)
Drug Dependence = TRUE (%) 50001 (69.4) 11644 (91.8) <0.001 0.590
Public Center = TRUE (%) 57100 (79.2) 3615 (28.5) <0.001 1.183
Sexo Usuario/Sex of User = Women (%) 17391 (24.1) 3936 (31.0) <0.001 0.155
Edad a la Fecha de Ingreso a Tratamiento (numérico continuo) (Primera Entrada)/Age at Admission to Treatment (First Entry) (median [IQR]) 34.43 [27.55, 43.45] 32.62 [26.33, 40.85] <0.001 nonnorm 0.185
Fecha de Ingreso a Tratamiento (Numérico)(c)/Date of Admission to Treatment (Numeric)(c) (median [IQR]) 16579.00 [15730.00, 17359.00] 16154.00 [15342.00, 17023.00] <0.001 nonnorm 0.292
Occupational Status (%) <0.001 1.026
Employed 39516 (54.8) 1769 (13.9)
Inactive 7668 (10.6) 1192 (9.4)
Looking for a job for the first time 172 (0.2) 20 (0.2)
No activity 2662 (3.7) 1823 (14.4)
Not seeking for work 492 (0.7) 334 (2.6)
Unemployed 21556 (29.9) 7551 (59.5)
#"tipo_de_plan_ambulatorio",
#https://cran.r-project.org/web/packages/tableone/vignettes/smd.html
#http://rstudio-pubs-static.s3.amazonaws.com/405765_2ce448f9bde24148a5f94c535a34b70e.html
#https://cran.r-project.org/web/packages/tableone/vignettes/introduction.html
#https://cran.r-project.org/web/packages/tableone/tableone.pdf
#https://www.rdocumentation.org/packages/tableone/versions/0.12.0/topics/CreateTableOne

## Construct a table 
#standardized mean differences of greater than 0.1


We checked the similarity in the samples using other measures, such as the variance ratio of the samples and Kolmogorov-Smirnov(KS) statistics.


library(cobalt)

bal2<-bal.tab(CONS_C1_df_dup_SEP_2020_match_not_miss2[,match.on.sel], treat = CONS_C1_df_dup_SEP_2020_match_not_miss2$tipo_de_plan_res,
         thresholds = c(m = .1, v = 2),
         binary = "std", 
         continuous = "std",
         stats = c("mean.diffs", "variance.ratios","ks.statistics"))
#"mean.diffs", "variance.ratios","ks.statistics","ovl.coefficient"

options(knitr.kable.NA = '')

bal2$Balance[,2]<-round(bal2$Balance[,2],2)
bal2$Balance[,4]<-round(bal2$Balance[,4],2)
bal2$Balance[,6]<-round(bal2$Balance[,6],2)

var_names<- 
    list("origen_ingreso_mod_Spontaneous"="Motive Admission-Spontaneous",
         "origen_ingreso_mod_Assisted Referral"= "Motive Admission-Assisted Referral",
         "origen_ingreso_mod_Other"="Motive Admission-Other",
         "origen_ingreso_mod_Justice Sector"= "Motive Admission-Justice Sector",
         "origen_ingreso_mod_Health Sector"="Motive Admission-Health Sector",
         "dg_cie_10_rec_Without psychiatric comorbidity"="ICD-10-Wo/Psych Comorbidity",
         "dg_cie_10_rec_Diagnosis unknown (under study)"="ICD-10-Dg. Unknown/under study",
         "dg_cie_10_rec_With psychiatric comorbidity"="ICD-10-W/Psych Comorbidity",
         "sexo_2_Women"="Sex-Women",
         "edad_al_ing"="Age at Admission",
         "fech_ing_num"="Date of Admission",
         "duplicates_filtered"="Treatments (#)",
         "more_one_treat"=">1 treatment",
         "sus_ini_mod_mvv_Alcohol"= "Starting Substance-Alcohol",
         "sus_ini_mod_mvv_Cocaine hydrochloride"= "Starting Substance-Cocaine hydrochloride",
         "sus_ini_mod_mvv_Cocaine paste"="Starting Substance-Cocaine paste",
         "sus_ini_mod_mvv_Marijuana"="Starting Substance-Marijuana",
         "sus_ini_mod_mvv_Other"="Starting Substance-Other",
         "estado_conyugal_2_Married/Shared living arrangements"="Marital Status-Married/Shared liv. arr.",
         "condicion_ocupacional_corr_Employed"="Occ.Status-Employed",
         "condicion_ocupacional_corr_Inactive"="Occ.Status-Inactive",
         "condicion_ocupacional_corr_Looking for a job for the first time"="Occ.Status-Looking 1st job",
         "condicion_ocupacional_corr_No activity"="Occ.Status- No activity",
         "condicion_ocupacional_corr_Not seeking for work"="Occ.Status- Not seeking work",
         "condicion_ocupacional_corr_Unemployed"="Occ.Status- Unemployed",
         "estado_conyugal_2_Separated/Divorced"="Marital Status-Separated/Divorced",
         "estado_conyugal_2_Single"= "Marital Status-Single",
         "estado_conyugal_2_Widower"="Marital Status-Widower",
         "escolaridad_rec_3-Completed primary school or less"="Educational Attainment-PS or less",
         "escolaridad_rec_2-Completed high school or less"="Educational Attainment-HS or less",
         "escolaridad_rec_1-More than high school"="Educational Attainment-More than HS",
         "freq_cons_sus_prin_1 day a week or more"="Freq Drug Cons-1d/wk or more",
         "freq_cons_sus_prin_2 to 3 days a week"="Freq Drug Cons-2-3d/wk",
         "freq_cons_sus_prin_4 to 6 days a week"="Freq Drug Cons-4-6d/wk",
         "freq_cons_sus_prin_Daily"="Freq Drug Cons-Daily",
         "freq_cons_sus_prin_Did not use"="Freq Drug Cons-Did not use",
         "freq_cons_sus_prin_Less than 1 day a week"="Freq Drug Cons-Less 1d/wk",
         "nombre_region_Antofagasta (02)"="Region-Antofagasta(02)",
         "nombre_region_Araucanía (09)"="Region-Araucanía(09)",
         "nombre_region_Arica (15)"="Region-Arica(15)",
         "nombre_region_Atacama (03)"="Region-Atacama(03)",
         "nombre_region_Aysén (11)"="Region-Aysén(11)",
         "nombre_region_Biobío (08)"="Region- Biobío(08)",
         "nombre_region_Coquimbo (04)"="Region-Coquimbo(04)",
         "nombre_region_Los Lagos (10)"="Region-Los Lagos(10)",
         "nombre_region_Los Ríos (14)"="Region-Los Ríos(14)",
         "nombre_region_Magallanes (12)"="Region-Magallanes(12)",
         "nombre_region_Maule (07)"="Region-Maule(07)",
         "nombre_region_Metropolitana (13)"="Region-Metropolitana(13)",
         "nombre_region_Ñuble (16)"="Region-Ñuble(16)",
         "nombre_region_O'Higgins (06)"="Region-O'Higgins(06)",
         "nombre_region_Tarapacá (01)"="Region-Tarapacá(01)",
         "nombre_region_Valparaíso (05)"="Region-Valparaíso(05)",
         "tipo_centro_pub"="Public Center",
         "dg_trs_cons_sus_or"= "Drug Dependence",
         "edad_ini_cons"="Age of Onset of Drug Use",
         "rn"="Treatment")

var.names<-data.table(data.frame(unlist(var_names)),keep.rownames = T) %>% janitor::clean_names()

balance_prev<-
data.table::data.table(bal2$Balance[,1:6],keep.rownames = T) %>%
  dplyr::arrange(-abs(Diff.Un)) %>% 
  dplyr::left_join(var.names,by="rn") %>% 
  dplyr::select(unlist_var_names,everything()) %>% 
  dplyr::select(-rn) 

balance_prev %>% #data.table::data.table(keep.rownames = F)
    knitr::kable(.,format = "html", format.args = list(decimal.mark = ".", big.mark = ","),
               caption = paste0("Table 4. Covariate Balance in the Variables of Interest"),
               col.names = c("Variables","Nature of Variables", "Unadjusted SMDs","Threshold","Unadjusted Variance Ratios","Threshold","Unadjusted KS"),
               align =rep('c', 101)) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 10) %>%
  kableExtra::add_footnote( c(paste("Note. ")), 
                            notation = "none") %>%
  kableExtra::scroll_box(width = "100%", height = "375px")
Table 4. Covariate Balance in the Variables of Interest
Variables Nature of Variables Unadjusted SMDs Threshold Unadjusted Variance Ratios Threshold Unadjusted KS
Public Center Binary -1.18 Not Balanced, >0.1 0.51
Occ.Status-Employed Binary -0.95 Not Balanced, >0.1 0.41
Freq Drug Cons-Daily Binary 0.72 Not Balanced, >0.1 0.34
Occ.Status- Unemployed Binary 0.62 Not Balanced, >0.1 0.30
Drug Dependence Binary 0.59 Not Balanced, >0.1 0.22
Freq Drug Cons-2-3d/wk Binary -0.52 Not Balanced, >0.1 0.21
Motive Admission-Assisted Referral Binary 0.48 Not Balanced, >0.1 0.17
Occ.Status- No activity Binary 0.38 Not Balanced, >0.1 0.11
Starting Substance-Alcohol Binary -0.35 Not Balanced, >0.1 0.17
ICD-10-Wo/Psych Comorbidity Binary -0.32 Not Balanced, >0.1 0.15
Marital Status-Married/Shared liv. arr. Binary -0.30 Not Balanced, >0.1 0.13
Date of Admission Contin. -0.29 Not Balanced, >0.1 1.00 Balanced, <2 0.14
Marital Status-Single Binary 0.28 Not Balanced, >0.1 0.14
Motive Admission-Spontaneous Binary -0.27 Not Balanced, >0.1 0.13
Freq Drug Cons-1d/wk or more Binary -0.25 Not Balanced, >0.1 0.05
Starting Substance-Marijuana Binary 0.23 Not Balanced, >0.1 0.10
ICD-10-W/Psych Comorbidity Binary 0.23 Not Balanced, >0.1 0.11
Region-Arica(15) Binary 0.21 Not Balanced, >0.1 0.04
Starting Substance-Cocaine paste Binary 0.20 Not Balanced, >0.1 0.07
Freq Drug Cons-Less 1d/wk Binary -0.19 Not Balanced, >0.1 0.03
Age at Admission Contin. -0.19 Not Balanced, >0.1 0.84 Balanced, <2 0.07
Region-Tarapacá(01) Binary 0.16 Not Balanced, >0.1 0.03
Sex-Women Binary 0.15 Not Balanced, >0.1 0.07
Occ.Status- Not seeking work Binary 0.15 Not Balanced, >0.1 0.02
Motive Admission-Justice Sector Binary -0.13 Not Balanced, >0.1 0.04
Educational Attainment-PS or less Binary 0.12 Not Balanced, >0.1 0.06
Region-Araucanía(09) Binary -0.12 Not Balanced, >0.1 0.02
Region-Magallanes(12) Binary -0.12 Not Balanced, >0.1 0.01
Freq Drug Cons-4-6d/wk Binary -0.11 Not Balanced, >0.1 0.04
Region-Antofagasta(02) Binary 0.11 Not Balanced, >0.1 0.02
Region-Coquimbo(04) Binary -0.10 Not Balanced, >0.1 0.02
Age of Onset of Drug Use Contin. -0.09 Balanced, <0.1 0.91 Balanced, <2 0.07
ICD-10-Dg. Unknown/under study Binary 0.09 Balanced, <0.1 0.03
Region-Aysén(11) Binary -0.09 Balanced, <0.1 0.01
Region-Ñuble(16) Binary -0.09 Balanced, <0.1 0.01
Freq Drug Cons-Did not use Binary -0.08 Balanced, <0.1 0.01
Educational Attainment-HS or less Binary -0.07 Balanced, <0.1 0.03
Educational Attainment-More than HS Binary -0.06 Balanced, <0.1 0.02
Region- Biobío(08) Binary -0.06 Balanced, <0.1 0.02
Region-Valparaíso(05) Binary 0.06 Balanced, <0.1 0.02
Region-Los Lagos(10) Binary -0.04 Balanced, <0.1 0.01
Occ.Status-Inactive Binary -0.04 Balanced, <0.1 0.01
Motive Admission-Other Binary 0.03 Balanced, <0.1 0.01
Region-Atacama(03) Binary -0.03 Balanced, <0.1 0.00
Region-Maule(07) Binary -0.03 Balanced, <0.1 0.01
Region-O’Higgins(06) Binary -0.03 Balanced, <0.1 0.01
Marital Status-Widower Binary -0.02 Balanced, <0.1 0.00
Motive Admission-Health Sector Binary -0.02 Balanced, <0.1 0.01
Occ.Status-Looking 1st job Binary -0.02 Balanced, <0.1 0.00
Starting Substance-Other Binary 0.01 Balanced, <0.1 0.00
Marital Status-Separated/Divorced Binary -0.01 Balanced, <0.1 0.00
Region-Los Ríos(14) Binary -0.01 Balanced, <0.1 0.00
Region-Metropolitana(13) Binary -0.01 Balanced, <0.1 0.01
Starting Substance-Cocaine hydrochloride Binary 0.00 Balanced, <0.1 0.00
Note.


We generated a plot to focus on unbalanced data.


Figure 8. Covariates Balance on Different Values

Figure 8. Covariates Balance on Different Values

Specification

First, we had to discretize categorical variables into logical parameters, and for continuous covariates, we divide them into 20 equal parts.


catVars<-
c("sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","tipo_centro_pub","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","dg_trs_cons_sus_or","nombre_region","tipo_de_plan_res","sexo_2","condicion_ocupacional_corr")
columna_dummy <- function(df, columna) {
  df %>% 
  mutate_at(columna, ~paste(columna, eval(as.symbol(columna)), sep = "_")) %>% 
    mutate(valor = 1) %>% 
    spread(key = columna, value = valor, fill = 0)
}

quantiles = function(covar, n_q) {
    p_q = seq(0, 1, 1/n_q)
    val_q = quantile(covar, probs = p_q, na.rm = TRUE)
    covar_out = rep(NA, length(covar))
    for (i in 1:n_q) {
        if (i==1) {covar_out[covar<val_q[i+1]] = i}
        if (i>1 & i<n_q) {covar_out[covar>=val_q[i] & covar<val_q[i+1]] = i}
        if (i==n_q) {covar_out[covar>=val_q[i] & covar<=val_q[i+1]] = i}}
    covar_out
}

CONS_C1_df_dup_SEP_2020_match_not_miss3<-CONS_C1_df_dup_SEP_2020_match_not_miss2
for (i in c(1:length(catVars))){#catVars[-10] excluding treatment indicator
  cat<-as.character(catVars[i])#catVars[-10] excluding treatment indicator
  CONS_C1_df_dup_SEP_2020_match_not_miss3<-columna_dummy(CONS_C1_df_dup_SEP_2020_match_not_miss3,cat)
}
CONS_C1_df_dup_SEP_2020_match_not_miss3$tipo_de_plan_res_FALSE<-NULL
CONS_C1_df_dup_SEP_2020_match_not_miss3$edad_ini_cons<-quantiles(CONS_C1_df_dup_SEP_2020_match_not_miss3$edad_ini_cons,20)
CONS_C1_df_dup_SEP_2020_match_not_miss3$edad_al_ing<-quantiles(CONS_C1_df_dup_SEP_2020_match_not_miss3$edad_al_ing,20)
CONS_C1_df_dup_SEP_2020_match_not_miss3$fech_ing_num<-quantiles(CONS_C1_df_dup_SEP_2020_match_not_miss3$fech_ing_num,20)
match.on.sel2<-names(CONS_C1_df_dup_SEP_2020_match_not_miss3)[-c(1,2,5)]
#"edad_ini_cons","edad_al_ing","fech_ing_num")

CONS_SEP_match = data.table::data.table(CONS_C1_df_dup_SEP_2020_match_not_miss2[order(CONS_C1_df_dup_SEP_2020_match_not_miss2$tipo_de_plan_res, decreasing = TRUE), ])

CONS_SEP_match_dum = data.table::data.table(CONS_C1_df_dup_SEP_2020_match_not_miss3 %>% dplyr::arrange(factor(row, levels = CONS_SEP_match$row)))


Match

The matched variables were defined for the treatments at baseline (n=84,755).


library(designmatch)

#fine = list(covs = fine_covs)
#solver = list(name = name, t_max = t_max, approximate = 1, round_cplex = 0, trace_cplex = 0).
#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:
# 1. Gurobi installation

#For an exact solution, we strongly recommend running designmatch either with CPLEX or Gurobi.  Between these two solvers, the R interface of Gurobi is considerably easier to install.  Here we provide general instructions for manually installing Gurobi and its R interface in Mac and Windows machines.

#1. Create a free academic license
#   Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/creating_a_new_academic_li.html

#2. Install the software
#   2.1. In http://www.gurobi.com/index, go to Downloads > Gurobi Software
#   2.2. Choose your operating system and press download
#
#3. Retrieve and set up your Gurobi license
#   2.1. Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/retrieving_and_setting_up_.html
#   2.2. Then follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/retrieving_a_free_academic.html
#
#4. Test your license
#   Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/testing_your_license.html
#
#5. Install the R interface of Gurobi   
#   Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/r_installing_the_r_package.html
#   * In Windows, in R run the command install.packages("PATH\\gurobi_7.X-Y.zip", repos=NULL) where path leads to the file gurobi_7.X-Y.zip (for example PATH=C:\\gurobi702\\win64\\R; note that the path may be different in your computer), and "7.X-Y" refers to the version you are installing.
#   * In MAC, in R run the command install.packages('PATH/gurobi_7.X-Y.tgz', repos=NULL) where path leads to the file gurobi_7.X-Y.tgz (for example PATH=/Library/gurobi702/mac64/R; note that the path may be different in your computer), and "7.X-Y" refers to the version you are installing.
#       
#6. Test the installation 
#   Load the library and run the examples therein
#   * A possible error that you may get is the following: "Error: package ‘slam’ required by ‘gurobi’ could not be found". If that case, install.packages('slam') and try again.
#   You should be all set!
CONS_SEP_match$tipo_de_plan_res<-ifelse(CONS_SEP_match$tipo_de_plan_res=="1",1,0)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:
require(slam)
# Solver options
#default solver is glpk with approximate = 1
#For an exact solution, we strongly recommend using cplex or gurobi as they are much faster than the other solvers, but they do require a license (free for academics, but not for people outside universities)
t_max = 60*60*6
solver = "gurobi" #cplex, glpk, gurobi and symphony
solver = list(name = solver, 
  t_max = t_max, #t_max is a scalar with the maximum time limit for finding the matches.within this time limit, a partial, suboptimal solution is given
  approximate = 0,#. If approximate = 1 (the default), an approximate solution is found via a relaxation of the original integer program. #FEB2021: I dont want to violate some balancing constraints to some extent. Change to 0.
  round_cplex = 0, 
  trace = 1#turns the optimizer output on
  )

#Indicador de tratamiento
t_ind= ifelse(CONS_SEP_match$tipo_de_plan_res=="1",1,0)

#table(is.na(CONS_SEP_match$tipo_de_plan_res))

# Moment balance: constrain differences in means to be at most 0.1 standard deviations apart
#:#:#:#:#:#:#:#:#:#:#:#:#:
#######mom_covs is a matrix where each column is a covariate whose mean is to be balanced
#######mom_tols is a vector of tolerances for the maximum difference in means for the covariates in mom_covs
#######mom_targets is a vector of target moments (e.g., means) of a distribution to be approximated by matched sampling. is optional, but if #######mom_covs is specified then mom_tols needs to be specified too
#######The lengths of mom_tols and mom_target have to be equal to the number of columns of mom_covs
mom_covs = cbind(CONS_SEP_match$edad_al_ing,
                 CONS_SEP_match$fech_ing_num,
                 CONS_SEP_match$edad_ini_cons)
mom_tols = absstddif(mom_covs, t_ind, .15)# original, 0.05, ahora probaré con 0.7
mom = list(covs = mom_covs, tols = mom_tols, targets = NULL)

# Mean balance
covs = cbind(CONS_SEP_match$edad_al_ing,
                 CONS_SEP_match$fech_ing_num,
                 CONS_SEP_match$edad_ini_cons)
meantab(covs, t_ind)
##      Mis      Min      Max   Mean T   Mean C Std Dif P-val
## [1,]   0    14.88    88.84    35.98    35.98       0     1
## [2,]   0 13621.00 18199.00 16445.64 16445.64       0     1
## [3,]   0     5.00    74.00    16.51    16.51       0     1
# Fine balance
#is a matrix where each column is a nominal covariate for fine balance
fine_covs = cbind(CONS_SEP_match$origen_ingreso_mod,
                  CONS_SEP_match$dg_cie_10_rec,
                  CONS_SEP_match$sexo_2,
                  CONS_SEP_match$sus_ini_mod_mvv,
                  CONS_SEP_match$tipo_centro_pub, #cuidado
                  CONS_SEP_match$estado_conyugal_2, 
                  CONS_SEP_match$escolaridad_rec,
                  CONS_SEP_match$freq_cons_sus_prin,
                  CONS_SEP_match$nombre_region,
                  CONS_SEP_match$condicion_ocupacional_corr,
                  #d_match_no_duplicates$evaluacindelprocesoteraputico,
                  CONS_SEP_match$dg_trs_cons_sus_or
)
fine = list(covs = fine_covs)

# 11,448; No. of controls: 11,448"
# 11,452; No. of controls: 11,452"
# 11,459; No. of controls: 11,459" #when I changed tolerance from .0999 to .1999
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#MATCH
start.time <- Sys.time()
set.seed(2125)
out = cardmatch(t_ind, #ES NECESARIO QUE LOS TRATAMIENTOS ESTEN ORDENADOS Y LOS OTROS VECTORES TAMBIËN 
                mom = mom,# ya los definí list(covs = mom_covs, tols = mom_tols, targets = mom_targets), 
          fine = fine, 
          solver = solver)
##   Building the matching problem... 
##   Gurobi optimizer is open... 
##   Finding the optimal matches... 
## Gurobi Optimizer version 9.1.2 build v9.1.2rc0 (win64)
## Thread count: 6 physical cores, 12 logical processors, using up to 12 threads
## Optimize a model with 60 rows, 84755 columns and 1440835 nonzeros
## Model fingerprint: 0x8dff7088
## Variable types: 0 continuous, 84755 integer (84755 binary)
## Coefficient statistics:
##   Matrix range     [1e+00, 2e+04]
##   Objective range  [1e+00, 1e+00]
##   Bounds range     [0e+00, 0e+00]
##   RHS range        [0e+00, 0e+00]
## Found heuristic solution: objective -0.0000000
## Presolve time: 1.90s
## Presolved: 60 rows, 84755 columns, 1440629 nonzeros
## Variable types: 0 continuous, 84755 integer (84755 binary)
## 
## Root relaxation: objective 1.145693e+04, 456 iterations, 0.85 seconds
## 
##     Nodes    |    Current Node    |     Objective Bounds      |     Work
##  Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time
## 
##      0     0 11456.9250    0   32   -0.00000 11456.9250      -     -    2s
## H    0     0                    2282.0000000 11456.9250   402%     -    7s
##      0     0 11456.9250    0   32 2282.00000 11456.9250   402%     -    8s
## H    0     0                    11456.000000 11456.9250  0.01%     -    8s
##      0     0 11456.9250    0   32 11456.0000 11456.9250  0.01%     -    8s
## 
## Cutting planes:
##   Zero half: 1
## 
## Explored 1 nodes (486 simplex iterations) in 8.76 seconds
## Thread count was 12 (of 12 available processors)
## 
## Solution count 3: 11456 2282 -0 
## 
## Optimal solution found (tolerance 1.00e-04)
## Best objective 1.145600000000e+04, best bound 1.145600000000e+04, gap 0.0000%
##   Optimal matches found
#FEB2021= If I change to bmatch, error can't allocate vector size 3.4gb
end.time <- Sys.time()
time.taken <- end.time - start.time
# Fine balance (note here we are getting an approximate solution)
#for (i in 1:ncol(fine_covs)) {     
#   print(finetab(fine_covs[, i], t_id_1, c_id_1))
#}
# Indices of the treated units and matched controls
t_id_1 = out$t_id  
c_id_1 = out$c_id   
group = out$group_id    
ids_matched<-cbind.data.frame(t_id_1, c_id_1,group)

paste0("No. of treatments: ",table(table(t_id_1)) %>% formatC(big.mark = ","),"; No. of controls: ",table(table(c_id_1))%>% formatC(big.mark = ","))
## [1] "No. of treatments: 11,456; No. of controls: 11,456"
# Fine balance (note here we are getting an approximate solution)
finetab_match1<-data.frame()
for (i in 1:ncol(fine_covs)) {      
    #finetab_match1<- rbind.data.frame(
  finetab(fine_covs[, i], t_id_1, c_id_1)
}

d_match = CONS_SEP_match[c(t_id_1, c_id_1), ]

paste0("Number of duplicated rows: ",d_match %>%  dplyr::group_by(row) %>%  dplyr::mutate(n_row=n()) %>% dplyr::ungroup() %>% dplyr::filter(n_row>1) %>% nrow())
## [1] "Number of duplicated rows: 0"
paste0("Percentage of the selected treatments: ",scales::percent(length(t_id_1)/CONS_SEP_match %>% dplyr::filter(tipo_de_plan_res==1) %>% nrow()))
## [1] "Percentage of the selected treatments: 90%"
paste0("Percentage of the selected controls: ",
       scales::percent(length(c_id_1)/CONS_SEP_match %>% dplyr::filter(tipo_de_plan_res==0) %>% nrow()))
## [1] "Percentage of the selected controls: 16%"
#cuidado, el anterior me encontró más del mismo control para un tratado
#por eso ocuparé el de más abajo.
#EL DE A CONTINUACIÓN ES ERRÓNEO PORQUE ES POR POSICIÓN, NO POR COINCIDENCIA DEL NÚMERO CON LA FILA
#d_match_no_duplicates = CONS_SEP_match[which(CONS_SEP_match$row %in% c(t_id_1, c_id_1)), ]


Explore Results of the Matching


Age at Admission

Figures 9-12. Empirical Cumulative Distribution Functions on the Matched Sample

Figures 9-12. Empirical Cumulative Distribution Functions on the Matched Sample

Age of Onset of Drug Use

Figures 9-12. Empirical Cumulative Distribution Functions on the Matched Sample

Figures 9-12. Empirical Cumulative Distribution Functions on the Matched Sample

Date of Admission

Figures 9-12. Empirical Cumulative Distribution Functions on the Matched Sample

Figures 9-12. Empirical Cumulative Distribution Functions on the Matched Sample


Love plot

Figure 10. Love plot of the Matched Sample in Covariates v/s Unmatched Sample

Figure 10. Love plot of the Matched Sample in Covariates v/s Unmatched Sample


Balance

Table 5a. Covariate Balance in the Variables of Interest
Unadjusted
Adjusted
Variables Nature of Variables SMDs Threshold Variance Ratios Threshold KS SMDs Threshold Variance Ratios Threshold KS
Public Center Binary -1.18 Not Balanced, >0.1 0.51 0.00 Balanced, <0.1 0.00
Occ.Status-Employed Binary -0.95 Not Balanced, >0.1 0.41 0.00 Balanced, <0.1 0.00
Freq Drug Cons-Daily Binary 0.72 Not Balanced, >0.1 0.34 0.00 Balanced, <0.1 0.00
Occ.Status- Unemployed Binary 0.62 Not Balanced, >0.1 0.30 0.00 Balanced, <0.1 0.00
Drug Dependence Binary 0.59 Not Balanced, >0.1 0.22 0.00 Balanced, <0.1 0.00
Freq Drug Cons-2-3d/wk Binary -0.52 Not Balanced, >0.1 0.21 0.00 Balanced, <0.1 0.00
Motive Admission-Assisted Referral Binary 0.48 Not Balanced, >0.1 0.17 0.00 Balanced, <0.1 0.00
Occ.Status- No activity Binary 0.38 Not Balanced, >0.1 0.11 0.00 Balanced, <0.1 0.00
Starting Substance-Alcohol Binary -0.35 Not Balanced, >0.1 0.17 0.00 Balanced, <0.1 0.00
>1 treatment Binary 0.33 Not Balanced, >0.1 0.14 0.23 Not Balanced, >0.1 0.10
ICD-10-Wo/Psych Comorbidity Binary -0.32 Not Balanced, >0.1 0.15 0.00 Balanced, <0.1 0.00
Treatments (#) Contin. 0.31 Not Balanced, >0.1 1.91 Balanced, <2 0.14 0.21 Not Balanced, >0.1 1.47 Balanced, <2 0.10
Marital Status-Married/Shared liv. arr. Binary -0.30 Not Balanced, >0.1 0.13 0.00 Balanced, <0.1 0.00
Date of Admission Contin. -0.29 Not Balanced, >0.1 1.00 Balanced, <2 0.14 -0.15 Not Balanced, >0.1 0.93 Balanced, <2 0.08
Marital Status-Single Binary 0.28 Not Balanced, >0.1 0.14 0.00 Balanced, <0.1 0.00
Motive Admission-Spontaneous Binary -0.27 Not Balanced, >0.1 0.13 0.00 Balanced, <0.1 0.00
Freq Drug Cons-1d/wk or more Binary -0.25 Not Balanced, >0.1 0.05 0.00 Balanced, <0.1 0.00
ICD-10-W/Psych Comorbidity Binary 0.23 Not Balanced, >0.1 0.11 0.00 Balanced, <0.1 0.00
Starting Substance-Marijuana Binary 0.23 Not Balanced, >0.1 0.10 0.00 Balanced, <0.1 0.00
Region-Arica(15) Binary 0.21 Not Balanced, >0.1 0.04 0.00 Balanced, <0.1 0.00
Starting Substance-Cocaine paste Binary 0.20 Not Balanced, >0.1 0.07 0.00 Balanced, <0.1 0.00
Age at Admission Contin. -0.19 Not Balanced, >0.1 0.84 Balanced, <2 0.07 0.06 Balanced, <0.1 0.98 Balanced, <2 0.04
Freq Drug Cons-Less 1d/wk Binary -0.19 Not Balanced, >0.1 0.03 0.00 Balanced, <0.1 0.00
Region-Tarapacá(01) Binary 0.16 Not Balanced, >0.1 0.03 0.00 Balanced, <0.1 0.00
Sex-Women Binary 0.15 Not Balanced, >0.1 0.07 0.00 Balanced, <0.1 0.00
Occ.Status- Not seeking work Binary 0.15 Not Balanced, >0.1 0.02 0.00 Balanced, <0.1 0.00
Motive Admission-Justice Sector Binary -0.13 Not Balanced, >0.1 0.04 0.00 Balanced, <0.1 0.00
Educational Attainment-PS or less Binary 0.12 Not Balanced, >0.1 0.06 0.00 Balanced, <0.1 0.00
Region-Araucanía(09) Binary -0.12 Not Balanced, >0.1 0.02 0.00 Balanced, <0.1 0.00
Region-Magallanes(12) Binary -0.12 Not Balanced, >0.1 0.01 0.00 Balanced, <0.1 0.00
Freq Drug Cons-4-6d/wk Binary -0.11 Not Balanced, >0.1 0.04 0.00 Balanced, <0.1 0.00
Region-Antofagasta(02) Binary 0.11 Not Balanced, >0.1 0.02 0.00 Balanced, <0.1 0.00
Region-Coquimbo(04) Binary -0.10 Not Balanced, >0.1 0.02 0.00 Balanced, <0.1 0.00
ICD-10-Dg. Unknown/under study Binary 0.09 Balanced, <0.1 0.03 0.00 Balanced, <0.1 0.00
Region-Aysén(11) Binary -0.09 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Region-Ñuble(16) Binary -0.09 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Age of Onset of Drug Use Contin. -0.09 Balanced, <0.1 0.91 Balanced, <2 0.07 0.00 Balanced, <0.1 1.01 Balanced, <2 0.01
Freq Drug Cons-Did not use Binary -0.08 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Educational Attainment-HS or less Binary -0.07 Balanced, <0.1 0.03 0.00 Balanced, <0.1 0.00
Educational Attainment-More than HS Binary -0.06 Balanced, <0.1 0.02 0.00 Balanced, <0.1 0.00
Region- Biobío(08) Binary -0.06 Balanced, <0.1 0.02 0.00 Balanced, <0.1 0.00
Region-Valparaíso(05) Binary 0.06 Balanced, <0.1 0.02 0.00 Balanced, <0.1 0.00
Region-Los Lagos(10) Binary -0.04 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Occ.Status-Inactive Binary -0.04 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Motive Admission-Other Binary 0.03 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Region-Atacama(03) Binary -0.03 Balanced, <0.1 0.00 0.00 Balanced, <0.1 0.00
Region-Maule(07) Binary -0.03 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Region-O’Higgins(06) Binary -0.03 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Motive Admission-Health Sector Binary -0.02 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Marital Status-Widower Binary -0.02 Balanced, <0.1 0.00 0.00 Balanced, <0.1 0.00
Occ.Status-Looking 1st job Binary -0.02 Balanced, <0.1 0.00 0.00 Balanced, <0.1 0.00
Starting Substance-Other Binary 0.01 Balanced, <0.1 0.00 0.00 Balanced, <0.1 0.00
Marital Status-Separated/Divorced Binary -0.01 Balanced, <0.1 0.00 0.00 Balanced, <0.1 0.00
Region-Los Ríos(14) Binary -0.01 Balanced, <0.1 0.00 0.00 Balanced, <0.1 0.00
Region-Metropolitana(13) Binary -0.01 Balanced, <0.1 0.01 0.00 Balanced, <0.1 0.00
Starting Substance-Cocaine hydrochloride Binary 0.00 Balanced, <0.1 0.00 0.00 Balanced, <0.1 0.00
Note. Unadjusted (n=84,755) ; Adjusted (n=22,912) ; Total pairs: 11,456


Table 5b. Covariate Balance in the Variables of Interest
Ambulatory Residential p-value class SMD
n 11456 11456
Starting Substance (%) 1.000 <0.001
Alcohol 4767 (41.6) 4767 (41.6)
Cocaine hydrochloride 479 (4.2) 479 (4.2)
Cocaine paste 1887 (16.5) 1887 (16.5)
Marijuana 4029 (35.2) 4029 (35.2)
Other 294 (2.6) 294 (2.6)
Marital Status (%) 1.000 <0.001
Married/Shared living arrangements 2751 (24.0) 2751 (24.0)
Separated/Divorced 1192 (10.4) 1192 (10.4)
Single 7396 (64.6) 7396 (64.6)
Widower 117 (1.0) 117 (1.0)
Educational Attainment (%) 1.000 <0.001
3-Completed primary school or less 3930 (34.3) 3930 (34.3)
2-Completed high school or less 5630 (49.1) 5630 (49.1)
1-More than high school 1896 (16.6) 1896 (16.6)
Age of Onset of Drug Use (median [IQR]) 15.00 [13.00, 17.00] 15.00 [13.00, 17.00] 0.709 nonnorm 0.002
Frequency of use of primary drug (%) 1.000 <0.001
1 day a week or more 271 (2.4) 271 (2.4)
2 to 3 days a week 1324 (11.6) 1324 (11.6)
4 to 6 days a week 1601 (14.0) 1601 (14.0)
Daily 8044 (70.2) 8044 (70.2)
Did not use 83 (0.7) 83 (0.7)
Less than 1 day a week 133 (1.2) 133 (1.2)
Origen de Ingreso (Primera Entrada)/Motive of Admission to Treatment (First Entry) (%) 1.000 <0.001
Spontaneous 4134 (36.1) 4134 (36.1)
Assisted Referral 2072 (18.1) 2072 (18.1)
Other 728 (6.4) 728 (6.4)
Justice Sector 793 (6.9) 793 (6.9)
Health Sector 3729 (32.6) 3729 (32.6)
Psychiatric Comorbidity (%) 1.000 <0.001
Without psychiatric comorbidity 3047 (26.6) 3047 (26.6)
Diagnosis unknown (under study) 2474 (21.6) 2474 (21.6)
With psychiatric comorbidity 5935 (51.8) 5935 (51.8)
Region of the Center (%) 1.000 <0.001
Antofagasta (02) 681 (5.9) 681 (5.9)
Araucanía (09) 160 (1.4) 160 (1.4)
Arica (15) 633 (5.5) 633 (5.5)
Atacama (03) 259 (2.3) 259 (2.3)
Aysén (11) 41 (0.4) 41 (0.4)
Biobío (08) 526 (4.6) 526 (4.6)
Coquimbo (04) 269 (2.3) 269 (2.3)
Los Lagos (10) 371 (3.2) 371 (3.2)
Los Ríos (14) 184 (1.6) 184 (1.6)
Magallanes (12) 31 (0.3) 31 (0.3)
Maule (07) 507 (4.4) 507 (4.4)
Metropolitana (13) 5653 (49.3) 5653 (49.3)
Ñuble (16) 20 (0.2) 20 (0.2)
O’Higgins (06) 513 (4.5) 513 (4.5)
Tarapacá (01) 450 (3.9) 450 (3.9)
Valparaíso (05) 1158 (10.1) 1158 (10.1)
Drug Dependence = TRUE (%) 10422 (91.0) 10422 (91.0) 1.000 <0.001
Public Center = TRUE (%) 3615 (31.6) 3615 (31.6) 1.000 <0.001
Sexo Usuario/Sex of User = Women (%) 3520 (30.7) 3520 (30.7) 1.000 <0.001
Edad a la Fecha de Ingreso a Tratamiento (numérico continuo) (Primera Entrada)/Age at Admission to Treatment (First Entry) (median [IQR]) 31.84 [25.74, 40.25] 32.84 [26.50, 41.04] <0.001 nonnorm 0.060
Fecha de Ingreso a Tratamiento (Numérico)(c)/Date of Admission to Treatment (Numeric)(c) (median [IQR]) 16350.00 [15446.00, 17246.00] 16132.00 [15335.00, 16995.00] <0.001 nonnorm 0.148
Occupational Status (%) 1.000 <0.001
Employed 1768 (15.4) 1768 (15.4)
Inactive 1172 (10.2) 1172 (10.2)
Looking for a job for the first time 20 (0.2) 20 (0.2)
No activity 1411 (12.3) 1411 (12.3)
Not seeking for work 276 (2.4) 276 (2.4)
Unemployed 6809 (59.4) 6809 (59.4)


Figure 13. Love plot of the Matched Sample in Covariates v/s Unmatched Sample

Figure 13. Love plot of the Matched Sample in Covariates v/s Unmatched Sample


We allowed to tolerate fech_ing_num (SMD=0.16), because the date of admission not necessarily had to be strictly balanced, assuming that not every user had to be admitted to treatment in exact dates.

Survival Setting

Bivariate

We selected the first treatments,


irrs<-function(x, y="event", z="person_days",db){
  #x= variable que agrupa
  #y= evento explicado
  #z= person days
  #db= base de datos
  fmla <- as.formula(paste0(y,"~",x))
  fmla2 <- as.formula(paste0(z,"~",x))
assign(paste0("irr_",y,"_por_",x),
       rateratio.test::rateratio.test(
     x=as.numeric(xtabs(fmla, data=get(db)))[c(2,1)],
     n=as.numeric(xtabs(fmla, data=get(db)))[c(2,1)]
    )
   )
return(
  rateratio.test::rateratio.test(
     x=as.numeric(xtabs(fmla, data=get(db)))[c(2,1)],
     n=as.numeric(xtabs(fmla2, data=get(db)))[c(2,1)]
      )
    )
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# CHECK  DUPLICATED ROWS
#CONS_C1_df_dup_SEP_2020%>% 
#  dplyr::filter(hash_key %in% unlist(unique(d_match$hash_key))) %>% 
#  janitor::tabyl(condicion_ocupacional_corr)

# d_match %>% 
    #dplyr::group_by(row) %>% dplyr::mutate(rn_row=row_number()) %>% janitor::tabyl(rn_row)
#22,914

#
#d_match_surv %>% janitor::tabyl(duplicates_filtered,event)
#nrow(ids_matched)/2 =11,457

#CONS_SEP_match %>% dplyr::group_by(hash_key) %>% dplyr::mutate(rn_hash=row_number()) %>% dplyr::ungroup() %>% janitor::tabyl(rn_hash)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

ids_matched_filter<-
ids_matched %>% 
    dplyr::group_by(t_id_1) %>% 
    dplyr::mutate(rn_id=row_number()) %>% 
    dplyr::ungroup() %>% 
    dplyr::filter(rn_id==1)

ids_matched_rows<-cbind.data.frame("row_t"=CONS_SEP_match[c(t_id_1),"row"],
                        t_id_1,
                        "row_c"=CONS_SEP_match[c(c_id_1),"row"],
                        c_id_1) %>% 
  janitor::clean_names() %>% 
  dplyr::left_join(subset(ids_matched_filter,select=-c_id_1),by="t_id_1")

CONS_C1_df_dup_SEP_2020_irrs_health<-  
d_match %>% 
  dplyr::left_join(CONS_C1_df_dup_SEP_2020[c("row","dias_treat_imp_sin_na", "event", "person_days","fech_egres_num", "person_years","diff_bet_treat")],by="row") %>%
  dplyr::left_join(ids_matched_rows, by=c("row")) %>% 
  dplyr::mutate(group_match=ifelse(!is.na(group),group,NA)) %>% 
  dplyr::select(-rn_id,-group) %>% #glimpse()
  dplyr::rename("row_c"="row_2") %>% 
  dplyr::left_join(ids_matched_rows, by=c("row"="row_2")) %>% 
  dplyr::mutate(t_id_1=ifelse(!is.na(t_id_1.x),t_id_1.x,t_id_1.y)) %>% 
  dplyr::mutate(c_id_1=ifelse(!is.na(c_id_1.x),c_id_1.x,c_id_1.y)) %>% 
  dplyr::mutate(row_c=ifelse(!is.na(row_c),row_c,row.y)) %>% 
  dplyr::mutate(group_match=ifelse(!is.na(group),group,group_match)) %>% 

  dplyr::select(-t_id_1.x,-c_id_1.x,-t_id_1.y,-c_id_1.y,-group,-row.y,-rn_id) %>% #glimpse()
  
  dplyr::mutate(res_drop_out=dplyr::case_when(
  tipo_de_plan_res==1 & abandono_temprano_rec==TRUE ~1,
  TRUE~0)) %>% 
  dplyr::mutate(min_ach=dplyr::case_when(
  evaluacindelprocesoteraputico=="3-Minimum Achievement" ~1,
  TRUE~0)) %>% 
  dplyr::mutate(res_drop_out=factor(res_drop_out)) %>% 
    dplyr::mutate(min_ach=factor(min_ach)) %>% 
  dplyr::mutate(status_censorship=dplyr::case_when(
  motivodeegreso_mod_imp=="Ongoing treatmentt" ~1,
  TRUE~0)) %>% 

  dplyr::mutate(outcome_to_readmission= dplyr::case_when(
                        event==1~ (diff_bet_treat)/365.25,# & grepl("",comp_status)
                        event==0~ (as.numeric(as.Date("2019-11-13"))-fech_egres_num)/365.25)) %>% 
  dplyr::mutate(admission_to_readmission= dplyr::case_when(
                        event==1~ (diff_bet_treat+dias_treat_imp_sin_na)/365.25,# & grepl("",comp_status)
                        event==0~ (as.numeric(as.Date("2019-11-13"))-fech_ing_num)/365.25))
  
# CONS_C1_df_dup_SEP_2020_irrs_health%>% janitor::tabyl(cnt_diagnostico_trs_fisico_irr)
#label(CONS_C1_df_dup_SEP_2020_prev4_explore$dg_fis_anemia) <- "Physical Dg. Anemia"
#   cnt_mod_cie_10_or cnt_otros_probl_at_sm_or

#22,914
#d_match %>% dplyr::group_by(hash_key) %>% dplyr::mutate(rn_hash=row_number()) %>% dplyr::ungroup() %>% nrow()

#27 Y ALGO
#CONS_C1_df_dup_SEP_2020_irrs_health %>% dplyr::group_by(hash_key) %>% dplyr::mutate(rn_hash=row_number()) %>% dplyr::ungroup() %>% nrow()

# HAY UN SEGUNDO TRATAMIENTO PARA 4,565 CASOS
#PARA VER SI HAY MAS DE UN CASO POR USUARIO
#CONS_C1_df_dup_SEP_2020_irrs_health %>% dplyr::group_by(hash_key) %>% dplyr::mutate(rn_hash=row_number()) %>% dplyr::ungroup() %>% janitor::tabyl(rn_hash)

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#irrs_min_ach & irrs_res_early- outcome to readmission  
irrs_early_drop<-irrs(x="abandono_temprano_rec", z="outcome_to_readmission", db="CONS_C1_df_dup_SEP_2020_irrs_health")
irrs_res_plan<-irrs(x="tipo_de_plan_res" ,z="admission_to_readmission", db="CONS_C1_df_dup_SEP_2020_irrs_health")
irrs_res_early<-irrs(x="res_drop_out" ,z="outcome_to_readmission", db="CONS_C1_df_dup_SEP_2020_irrs_health")
irrs_min_ach<-irrs(x="min_ach" , z="outcome_to_readmission", db="CONS_C1_df_dup_SEP_2020_irrs_health")


The incidence rate of readmission was 0.97 (95% IC 0.91-1.03) in users that had at least an early dropout, compared with users that did not have a physical condition at baseline (p= 0.377).


Figure 12. Cum. Hazards to Experience Readmission to SUD Treatment, by Ealy Dropout of Treatment at Baseline

Figure 12. Cum. Hazards to Experience Readmission to SUD Treatment, by Ealy Dropout of Treatment at Baseline


The incidence rate of readmission was 1.49 (95% IC 1.41-1.56) in users that had a residential plan, compared with users that had an ambulatory plan at baseline (p<0.001).


Figure 13. Cum. Hazards to Experience Readmission to SUD Treatment, by Type of Plan at Baseline

Figure 13. Cum. Hazards to Experience Readmission to SUD Treatment, by Type of Plan at Baseline


The incidence rate of readmission was 1.25 (95% IC 1.16-1.34) in users that had a residential plan and an early dropout, compared with the rest of users at baseline (p<0.001).


Figure 14. Cum. Hazards to Experience Readmission to SUD Treatment, whether it was a person in a Residential Treatment with an Early Dropout

Figure 14. Cum. Hazards to Experience Readmission to SUD Treatment, whether it was a person in a Residential Treatment with an Early Dropout


The incidence rate of readmission was 1.16 (95% IC 1.1-1.22) in users that had a minimum achievement of the therapeutic goals, compared with the rest of users at baseline (p<0.001).


Figure 15. Cum. Hazards to Experience Readmission to SUD Treatment, whether it was a person had a Minimum Achievement in Therapeutic Goals

Figure 15. Cum. Hazards to Experience Readmission to SUD Treatment, whether it was a person had a Minimum Achievement in Therapeutic Goals


Multistate


#  dplyr::filter(motivodeegreso_mod_imp!="En curso")%>% #Sacar los tratamientos que estén en curso 


tab1_lab<- paste0('Original C1 Dataset \n(n = ', formatC(nrow(CONS_C1), format='f', big.mark=',', digits=0), ';\nusers: ',formatC(CONS_C1%>% dplyr::distinct(HASH_KEY)%>% nrow(), format='f', big.mark=',', digits=0),')')
tab2_lab<- paste0('C1 Dataset \n(n = ', formatC(nrow(CONS_C1_df_dup_SEP_2020), format='f', big.mark=',', digits=0), ';\nusers: ',formatC(CONS_C1_df_dup_SEP_2020%>% dplyr::distinct(hash_key)%>% nrow(), format='f', big.mark=',', digits=0),')')
tab1_5_lab<- paste0('&#8226; Duplicated entries\\l &#8226; Overlapping treatments of users\\l &#8226; Intermediate events of treatment (continuous referrals)')
tab4_lab<- paste0('Imputed C1 Dataset \n(n = ', formatC(nrow(CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados), format='f', big.mark=',', digits=0), ';\nusers: ',formatC(CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados%>% dplyr::distinct(hash_key)%>% nrow(), format='f', big.mark=',', digits=0),')')
tab3_5_lab<- paste0('C1 Dataset \n(n = ', formatC(nrow(CONS_C1_df_dup_SEP_2020_match_miss_after_imp_descartados), format='f', big.mark=',', digits=0), ';\nusers: ',formatC(CONS_C1_df_dup_SEP_2020_match_miss_after_imp_descartados%>% dplyr::distinct(hash_key)%>% nrow(), format='f', big.mark=',', digits=0),')')
tab6_lab<- paste0('C1 Matched Sample\nin Treatment Setting \n(n = ', formatC(CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados %>% 
  dplyr::filter(hash_key %in% unlist(unique(d_match$hash_key))) %>% nrow(), format='f', big.mark=',', digits=0), ';\nusers: ',formatC(CONS_C1_df_dup_SEP_2020_match_miss_after_imp_conservados %>% 
  dplyr::filter(hash_key %in% unlist(unique(d_match$hash_key))) %>% dplyr::distinct(hash_key)%>% nrow(), format='f', big.mark=',', digits=0),')')

lab_tab<- paste0("  Result of the matching on treatment setting\nNo. of treatments: ",table(table(t_id_1)) %>% formatC(big.mark = ","),"; No. of controls: ",table(table(c_id_1))%>% formatC(big.mark = ","))

#https://stackoverflow.com/questions/46750364/diagrammer-and-graphviz
#https://mikeyharper.uk/flowcharts-in-r-using-diagrammer/
#http://blog.nguyenvq.com/blog/2012/05/29/better-decision-tree-graphics-for-rpart-via-party-and-partykit/
#http://blog.nguyenvq.com/blog/2014/01/17/skeleton-to-create-fast-automatic-tree-diagrams-using-r-and-graphviz/
#https://cran.r-project.org/web/packages/DiagrammeR/vignettes/graphviz-mermaid.html
#https://stackoverflow.com/questions/39133058/how-to-use-graphviz-graphs-in-diagrammer-for-r
#https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781789802566/1/ch01lvl1sec21/creating-diagrams-via-the-diagrammer-package
#https://justlegal.be/2019/05/using-flowcharts-to-display-legal-procedures/
# paste0("No. of treatments: ",table(table(t_id_1)) %>% formatC(big.mark = ","),"; No. of controls: ",table(table(c_id_1))%>% formatC(big.mark = ","))
#
library(DiagrammeR) #⋉
grViz("digraph flowchart {
      # node definitions with substituted label text
      node [fontname = Times, shape = rectangle,fontsize = 9]        
      tab1 [label = '@@1']
      tab2 [label = '@@2']
      tab3 [label = '&#8226;Duplicated entries\\l&#8226;Intermediate events of treatment (continuous referrals)\\l',fontsize = 7]
      tab4 [label = '@@4']
      blank [label = '', width = 0.0001, height = 0.0001]
      blank2 [label = '', width = 0.0001, height = 0.0001]
      blank3 [label = '', width = 0.0001, height = 0.0001]
      tab5 [label = '&#8226;Logically Inconsistent candidates for imputation\\l&#8226;Ties in candidates for imputation\\l',fontsize = 7]
      tab6 [label= '@@6']
      tab7 [label = '&#8226;Matching pairs based on balance of covariates at basline,\\l&#8226;Pairs 1:1\\l',fontsize = 7]
      
      # edge definitions with the node IDs
      tab1 -> blank [arrowhead = none,label='  Data wrangling and normalization process',fontsize = 8];
      blank -> tab3
      blank -> tab2
      tab2 -> blank2 [arrowhead = none];
      blank2 -> tab5 
      blank2 -> tab4 [label='  Result of the imputation of missing values',fontsize = 8];
      tab4 -> blank3 [arrowhead= none];
      blank3-> tab7
      blank3 -> tab6 [label='@@7',fontsize = 8];
            subgraph {
              rank = same; tab3; blank;
            }
            subgraph {
              rank = same; tab5; blank2;
            }
            subgraph {
              rank = same; tab7; blank3;
            }
      }

      [1]:  tab1_lab
      [2]:  tab2_lab
      [3]:  tab1_5_lab
      [4]:  tab4_lab
      [5]:  ''
      [6]:  tab6_lab
      [7]:  lab_tab
      ")
#      {rank=same; 'tab2'' -> tab3 [label='',fontsize = 11]}; #⋉
#CONS_C1_df_dup_SEP_2020_irrs_health
Table 6. Summary descriptives table
Variables Ambulatory Residential Sig.
N=17150 N=15121
Motive of Admission to Treatment (First Entry): <0.001
Spontaneous 6991 (40.8%) 5567 (36.8%)
Assisted Referral 2943 (17.2%) 3089 (20.4%)
Other 955 (5.57%) 918 (6.07%)
Justice Sector 1223 (7.13%) 980 (6.48%)
Health Sector 5038 (29.4%) 4567 (30.2%)
Psychiatric Comorbidity: <0.001
Without psychiatric comorbidity 4713 (27.5%) 3844 (25.4%)
Diagnosis unknown (under study) 3516 (20.5%) 3311 (21.9%)
With psychiatric comorbidity 8921 (52.0%) 7966 (52.7%)
Sexo Usuario/Sex of User: 0.063
Men 11431 (66.7%) 10227 (67.6%)
Women 5719 (33.3%) 4894 (32.4%)
Age at Admission to Treatment 32.7 [26.7;40.7] 33.0 [26.9;41.0] 0.058
Treatment Length (>90): <0.001
FALSE 14033 (81.8%) 12089 (79.9%)
TRUE 3117 (18.2%) 3028 (20.0%)
‘Missing’ 0 (0.00%) 4 (0.03%)
Treatments by User (#): 0.005
1 8853 (51.6%) 7661 (50.7%)
2 4670 (27.2%) 4102 (27.1%)
3 2174 (12.7%) 1924 (12.7%)
4 921 (5.37%) 843 (5.58%)
5 313 (1.83%) 352 (2.33%)
6 157 (0.92%) 155 (1.03%)
7 44 (0.26%) 54 (0.36%)
8 18 (0.10%) 30 (0.20%)
More than one treatment: 0.088
0 8853 (51.6%) 7661 (50.7%)
1 8297 (48.4%) 7460 (49.3%)
Starting Substance: <0.001
Alcohol 7445 (43.4%) 6201 (41.0%)
Cocaine hydrochloride 754 (4.40%) 645 (4.27%)
Cocaine paste 2493 (14.5%) 2375 (15.7%)
Marijuana 6034 (35.2%) 5519 (36.5%)
Other 424 (2.47%) 381 (2.52%)
Marital Status: <0.001
Married/Shared living arrangements 4277 (24.9%) 3470 (22.9%)
Separated/Divorced 1869 (10.9%) 1565 (10.3%)
Single 10819 (63.1%) 9939 (65.7%)
Widower 185 (1.08%) 147 (0.97%)
Educational Attainment: 0.006
3-Completed primary school or less 5243 (30.6%) 4847 (32.1%)
2-Completed high school or less 8909 (51.9%) 7774 (51.4%)
1-More than high school 2998 (17.5%) 2500 (16.5%)
Frequency of use of primary drug: <0.001
1 day a week or more 591 (3.45%) 324 (2.14%)
2 to 3 days a week 2505 (14.6%) 1574 (10.4%)
4 to 6 days a week 2447 (14.3%) 1973 (13.0%)
Daily 10878 (63.4%) 10991 (72.7%)
Did not use 337 (1.97%) 105 (0.69%)
Less than 1 day a week 392 (2.29%) 154 (1.02%)
Public Center: <0.001
FALSE 9896 (57.7%) 10747 (71.1%)
TRUE 7254 (42.3%) 4374 (28.9%)
Minimum Achievement in the Therapeutic Process: <0.001
Ongoing treatment 1168 (6.81%) 660 (4.36%)
Minimum achievement 8440 (49.2%) 6194 (41.0%)
High/Medium achievement 7542 (44.0%) 8267 (54.7%)
Drug Dependence: <0.001
FALSE 2073 (12.1%) 1314 (8.69%)
TRUE 15077 (87.9%) 13807 (91.3%)
Age of Onset of Drug Use 15.0 [14.0;17.0] 15.0 [13.0;17.0] 0.027
Occupational Status: <0.001
Employed 3815 (22.2%) 2008 (13.3%)
Inactive 1878 (11.0%) 1533 (10.1%)
Looking for a job for the first time 32 (0.19%) 23 (0.15%)
No activity 1856 (10.8%) 2133 (14.1%)
Not seeking for work 351 (2.05%) 401 (2.65%)
Unemployed 9218 (53.7%) 9023 (59.7%)
Days of Treatment (missing dates of discharge were replaced with difference from 2019-11-13) 153 [84.0;276] 150 [66.0;277] <0.001
Users with Posterior Treatments (=1): 0.088
0 8853 (51.6%) 7661 (50.7%)
1 8297 (48.4%) 7460 (49.3%)
User’s Days available in the system for the study 408 [145;1175] 401 [151;1093] 0.019
User’s Years available in the system for the study 1.12 [0.40;3.22] 1.10 [0.41;2.99] 0.019
Days of difference between the Next Treatment 347 [137;780] 263 [72.0;688] <0.001
Treatment Successful Completion: <0.001
Ongoing treatment 1168 (6.81%) 660 (4.36%)
Completion 3144 (18.3%) 4368 (28.9%)
Non-completion 12838 (74.9%) 10093 (66.7%)
Cause of Discharge: <0.001
Administrative discharge 1481 (8.64%) 1905 (12.6%)
Early Drop-out 3117 (18.2%) 3028 (20.0%)
Late Drop-out 6041 (35.2%) 2978 (19.7%)
Ongoing treatment 1168 (6.81%) 660 (4.36%)
Referral to another treatment 2199 (12.8%) 2182 (14.4%)
Therapeutic discharge 3144 (18.3%) 4368 (28.9%)
Note. Variables of C1 dataset had to be standardized before comparison;
Continuous variables are presented as Medians and Percentiles 25 and 75 were shown;
Categorical variables are presented as number (%)


After matching, we selected 32,271 treatments (users=22,912).


library(Epi)
#For censored state transitions it can be awkward having to replicate the censoring time for each non-visited state
#https://github.com/stulacy/multistateutils
states_trans<-c("Admission",    "Readmission",  "Readmission2", "Readmission3", "Readmission4")

trans_matrix <- matrix(c(
NA,1,NA,NA,NA,
NA,NA,2,NA,NA,
NA,NA,NA,3,NA,
NA,NA,NA,NA,4,
NA,NA,NA,NA,NA
), nrow=5, ncol=5,byrow=TRUE,dimnames=list(from=states_trans,to=states_trans))

Tot_reg<-
d_match_surv %>% 
    dplyr::select(id, duplicates_filtered, fech_ing_num,fech_egres_num,dias_treat_imp_sin_na,fech_ing_next_treat,tipo_de_plan_res,motivodeegreso_mod_imp,min_achievement,group_match,dup) %>%
    ## Filter cases with 4 or more registries
    #sum(prop.table(table(d_match_surv$dup))[1:3])
    #dplyr::filter(dup<4) %>% 
     nrow()

Less4_reg<-
d_match_surv %>% 
    dplyr::select(id, duplicates_filtered, fech_ing_num,fech_egres_num,dias_treat_imp_sin_na,fech_ing_next_treat,tipo_de_plan_res,motivodeegreso_mod_imp,min_achievement,group_match,dup) %>%
    ## Filter cases with 4 or more registries
    #sum(prop.table(table(d_match_surv$dup))[1:3])
    dplyr::filter(dup<5) %>% 
     nrow()


#All possible paths through the multi-state model can be found here:
boxes.Lexis(trans_matrix, wmult=1.3, hmult=1.3, cex=.9,
             boxpos = list(y = rep(50,5),
                           x = (1:5)*(20)-10), 
            txt.arr=c(expression("1) " *lambda['12']), 
                      expression("2) " *lambda['23']),
                      expression("3) " *lambda['34']),
                      expression("4) " *lambda['45'])
                      ))
title(sub = paste0("No recurring states;\nAbsorbing state: Fourth Readmission (",scales::percent((Less4_reg/Tot_reg),accuracy = 0.1)," of the registries, considering that each registry\n had a time-to-readmission); Other causes of discharge were not events of interest")) ## internal titles


To the first and second states, we subtracted one day if it overlaps with the date of discharge. For the third and the following states, we added one day in case of overlapping dates due to continous treatments.


### diff_bet_treat is the variable that includes time-to-readmission
### AGS: Starts in 0, excepting left truncated cases
### variables should start with time_ & status_
### Transform to years once generated
### Looks that they all share the same objective time
### AGS: If there is a continous state, interval censoring is not necessary 
### 0's are censored status

library(mstate)

d_match_surv_msprep<-
  d_match_surv %>% 
  dplyr::select(id, duplicates_filtered, fech_ing_num,fech_egres_num,dias_treat_imp_sin_na,fech_ing_next_treat,tipo_de_plan_res,motivodeegreso_mod_imp,min_achievement,group_match,dup, dias_treat_imp_sin_na) %>% 
  ## Filter the fifth readmission of registries
  dplyr::filter(dup<6) %>% 
  dplyr::mutate(tipo_de_plan_res=if_else(tipo_de_plan_res=="1",1,0,0)) %>% 
  dplyr::mutate(TD=if_else(motivodeegreso_mod_imp=="Therapeutic discharge",1,0,0)) %>% 
  dplyr::mutate(DWCA=if_else(motivodeegreso_mod_imp %in% c("Early Drop-out","Late Drop-out","Administrative discharge"),1,0,0)) %>% 
  #dplyr::mutate(tipo_de_plan_res_baseline=tipo_de_plan_res) %>% 
  tidyr::pivot_wider(id_cols=c("id","group_match","duplicates_filtered"), names_from=dup, names_sep="_", values_from=c("fech_ing_num","tipo_de_plan_res","TD","DWCA", "dias_treat_imp_sin_na")) %>% #"","motivodeegreso_mod_imp","min_achievement"
  #,"tipo_de_plan_res_baseline"
  dplyr::arrange(id) %>%
  dplyr::select(id, group_match,everything()) %>% 
  #display error if there are more than row per user
  purrr::when(dplyr::group_by(.,id) %>% dplyr::count() %>% filter(n>1) %>% nrow()>0 ~ stop("more than one case by row"), 
              ~.) %>% 
  #22,916 x 20
  #Check overlapped dates
  purrr::when(dplyr::mutate(.,diff_bet_treat1= fech_ing_num_2-fech_ing_num_1)%>% dplyr::filter(diff_bet_treat1<=0)%>% nrow()>0 ~ stop("There are cases with differences different than 0 to 2 days to a variable that should be the same"), 
              ~.) %>%
  purrr::when(dplyr::mutate(.,diff_bet_treat2= fech_ing_num_3-fech_ing_num_2)%>% dplyr::filter(diff_bet_treat2<=0)%>% nrow()>0 ~ stop("There are cases with differences different than 0 to 2 days to a variable that should be the same"), 
              ~.) %>%
  purrr::when(dplyr::mutate(.,diff_bet_treat3= fech_ing_num_4-fech_ing_num_3)%>% dplyr::filter(diff_bet_treat3<=0)%>% nrow()>0 ~ stop("There are cases with differences different than 0 to 2 days to a variable that should be the same"), 
              ~.) %>%
    purrr::when(dplyr::mutate(.,diff_bet_treat4= fech_ing_num_5-fech_ing_num_4)%>% dplyr::filter(diff_bet_treat4<=0)%>% nrow()>0 ~ stop("There are cases with differences different than 0 to 2 days to a variable that should be the same"), 
              ~.) %>%
    dplyr::mutate(Readmission_status=if_else(!is.na(fech_ing_num_2),1,0,0),
                  Readmission2_status=if_else(!is.na(fech_ing_num_3),1,0,0),
                  Readmission3_status=if_else(!is.na(fech_ing_num_4),1,0,0),
                  Readmission4_status=if_else(!is.na(fech_ing_num_5),1,0,0)) %>% 
  
#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:
## 2021-03-24, I had to reespecify times to objective times, in order to avoid further problems
## 2021-05-06, CENSORED TIME IS NOT THE DIFFERENCE BETWEEN THE TIME OF CENSORSIP AND THE TIME OF THE LAST EVENT, IS THE TOTAL DIFFERENCE. THE SUM OF DAYS UNTIL THE FOLLOWUP TIME
  dplyr::mutate( 
  Readmission_time= dplyr::case_when(
        Readmission_status==1~as.numeric(fech_ing_num_2-fech_ing_num_1),
        Readmission_status==0~as.numeric(as.Date("2019-11-13"))-fech_ing_num_1)) %>% 
  dplyr::mutate( 
  Readmission2_time= dplyr::case_when(
        Readmission2_status==1~as.numeric(fech_ing_num_3-fech_ing_num_1),
        Readmission2_status==0~as.numeric(as.Date("2019-11-13"))-fech_ing_num_1)) %>% 
  dplyr::mutate( 
  Readmission3_time= dplyr::case_when(
        Readmission3_status==1~as.numeric(fech_ing_num_4-fech_ing_num_1),
        Readmission3_status==0~as.numeric(as.Date("2019-11-13"))-fech_ing_num_1)) %>% 
  dplyr::mutate( 
  Readmission4_time= dplyr::case_when(
        Readmission4_status==1~as.numeric(fech_ing_num_5-fech_ing_num_1),
        Readmission4_status==0~as.numeric(as.Date("2019-11-13"))-fech_ing_num_1)) %>% 
    
    ## THE USERS THAT  DID NOT REGISTERED AN EVENT WILL COME UP TO THE FINAL TIME OF THE FOLLOW UP
 dplyr::select(
     id, group_match, 
     tipo_de_plan_res_1,tipo_de_plan_res_2, tipo_de_plan_res_3, tipo_de_plan_res_4, 
     Readmission_time, Readmission_status, Readmission2_time, Readmission2_status, 
     Readmission3_time, Readmission3_status, Readmission4_time, Readmission4_status, 
     dias_treat_imp_sin_na_1, dias_treat_imp_sin_na_2, dias_treat_imp_sin_na_3, dias_treat_imp_sin_na_4,
     TD_1, TD_2, TD_3, TD_4, DWCA_1, DWCA_2, DWCA_3, DWCA_4) %>%  
  as.data.frame() 

#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:
#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:#:#:#:##:#:
tail(d_match_surv_msprep) %>% 
      knitr::kable(format= "html", format.args= list(decimal.mark= ".", big.mark= ","),
               caption="Table 8. Data in Wide, Ten-states",
               align= c("c",rep('c', 5)))%>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size= 13)%>% 
  kableExtra::add_footnote("Note= Proportions from the initial state") %>% 
  kableExtra::scroll_box(width = "100%", height = "350px")
Table 8. Data in Wide, Ten-states
id group_match tipo_de_plan_res_1 tipo_de_plan_res_2 tipo_de_plan_res_3 tipo_de_plan_res_4 Readmission_time Readmission_status Readmission2_time Readmission2_status Readmission3_time Readmission3_status Readmission4_time Readmission4_status dias_treat_imp_sin_na_1 dias_treat_imp_sin_na_2 dias_treat_imp_sin_na_3 dias_treat_imp_sin_na_4 TD_1 TD_2 TD_3 TD_4 DWCA_1 DWCA_2 DWCA_3 DWCA_4
22907 22,907 11,449 1 2,177 0 2,177 0 2,177 0 2,177 0 29 0 1
22908 22,908 11,450 1 2,787 0 2,787 0 2,787 0 2,787 0 234 1 0
22909 22,909 11,452 1 1,898 0 1,898 0 1,898 0 1,898 0 471 1 0
22910 22,910 11,454 1 1,989 0 1,989 0 1,989 0 1,989 0 289 1 0
22911 22,911 11,455 1 525 0 525 0 525 0 525 0 56 0 1
22912 22,912 11,456 1 2,417 0 2,417 0 2,417 0 2,417 0 60 0 1
a Note= Proportions from the initial state
invisible("No se si debiera transformarlo a años. Tal vez a meses. Si lo transformo, me darán esas extrapolaciones bizarras del artículo anterior")


ms_d_match_surv <- mstate::msprep(time = c(NA, "Readmission_time", "Readmission2_time", "Readmission3_time", "Readmission4_time"), 
                  status = c(NA, "Readmission_status", "Readmission2_status", "Readmission3_status", "Readmission4_status"), 
                                            data = d_match_surv_msprep,
                                            id = "id",
                                            trans = trans_matrix,
                                            keep =  c(paste0("tipo_de_plan_res_",1:4), paste0("TD_",1:4),paste0("DWCA_",1:4),paste0("dias_treat_imp_sin_na_",1:4)))

#From starting state 1, subject 66 74 19717 has smallest transition time with status=0
#Everyne has an infinite number in the transition. A good exmple is the user 19717. Only experienced a therapeutic discharge, but in the time from readmission it starts on 910 but ends in INf
#Starting from state 1, simultaneous transitions possible for subjects 36666 36586 56465 136847 37595 60609 51706 76376 117544 140210 at times 126 472 32 36 1 203 45 14 5 71; smallest receiving state chosen
invisible(c("This problem responds to differences between treatments 0. Should be resolved in the initial stages"))
if(no_mostrar==1){
  d_match_surv_msprep %>% 
    dplyr::filter(id %in% unlist(
       ms_d_match_surv%>% 
        dplyr::filter(Tstop<=Tstart) %>% 
        dplyr::select(id,from,to,trans,Tstart,Tstop,time,status) %>% 
        distinct(id))) %>%
    #dplyr::mutate(diff_bet_treat=fech_ing_next_treat-fech_egres_num)%>% 
    View()
}

if(no_mostrar==1){
d_match_surv %>% 
    dplyr::rename("id"="row") %>% 
    dplyr::filter(id %in% unlist(
        ms2_CONS_C1_SEP_2020_women_imputed %>% 
            dplyr::filter(Tstop<=Tstart) %>% 
            dplyr::select(id,from,to,trans,Tstart,Tstop,time,status) %>% 
            distinct(id))) %>%
    dplyr::select(id, motivodeegreso_mod_imp, contains("fech"))
}
path<-rstudioapi::getSourceEditorContext()$path
if (grepl("CISS Fondecyt",path)==T){
    dta_path<-paste0("C:/Users/CISS Fondecyt/Mi unidad/Alvacast/SISTRAT 2019 (github)/")
  } else if (grepl("andre",path)==T){
    dta_path<-paste0('C:/Users/andre/Desktop/SUD_CL/')
  } else if (grepl("E:",path)==T){
    dta_path<-paste0("E:/Mi unidad/Alvacast/SISTRAT 2019 (github)/")
  } else {
    dta_path<-paste0("G:/Mi unidad/Alvacast/SISTRAT 2019 (github)/")
  }

rio::export(
d_match_surv_msprep %>% 
      dplyr::select(
      id, group_match,Readmission_status, Readmission2_status, Readmission3_status, Readmission4_status,
      Readmission_time, Readmission2_time, Readmission3_time, Readmission4_time,
      tipo_de_plan_res_1,tipo_de_plan_res_2, tipo_de_plan_res_3, tipo_de_plan_res_4,
      dias_treat_imp_sin_na_1, dias_treat_imp_sin_na_2, dias_treat_imp_sin_na_3, dias_treat_imp_sin_na_4,
      TD_1, TD_2, TD_3, TD_4, DWCA_1, DWCA_2, DWCA_3, DWCA_4), 
paste0(dta_path,"_mult_state_ags/five_st_msprep_jun.dta"))

rio::export(
d_match_surv_msprep %>% 
  rename_with(~ c("group.match","Readmission.status", "Readmission2.status", "Readmission3.status", "Readmission4.status",
      "Readmission.time", "Readmission2.time", "Readmission3.time", "Readmission4.time"), c("group_match", "Readmission_status", "Readmission2_status", "Readmission3_status", "Readmission4_status",
      "Readmission_time", "Readmission2_time", "Readmission3_time","Readmission4_time")) %>% 
      dplyr::select(
      id, group.match,Readmission.status, Readmission2.status, Readmission3.status, Readmission4.status,
      Readmission.time, Readmission2.time, Readmission3.time, Readmission4.time,
      tipo_de_plan_res_1,tipo_de_plan_res_2, tipo_de_plan_res_3, tipo_de_plan_res_4,
      dias_treat_imp_sin_na_1, dias_treat_imp_sin_na_2, dias_treat_imp_sin_na_3, dias_treat_imp_sin_na_4,
      TD_1, TD_2, TD_3, TD_4, DWCA_1, DWCA_2, DWCA_3, DWCA_4,
      ), 
paste0(dta_path,"_mult_state_ags/five_st_msprep_jun.csv"))


tab9_f<-
data.frame(events(ms_d_match_surv)$Frequencies) %>% 
    dplyr::filter(to!="total entering") %>% 
    left_join(data.frame(events(ms_d_match_surv)$Proportions), by=c("from", "to")) %>% 
    dplyr::rename("Frequencies"="Freq.x", "Proportions"="Freq.y") %>% 
    dplyr::arrange(from, to) %>% 
    dplyr::mutate(diff=ifelse(as.character(from)!=as.character(to),0,1)) %>% 
    dplyr::filter(diff==0) %>%
    dplyr::select(-diff) %>% 
    dplyr::mutate(comb=paste0(from,"_",to)) %>% 
    dplyr::filter(comb %in% c("Admission_Readmission", "Readmission_Readmission2","Readmission2_Readmission3","Readmission3_Readmission4","Readmission4_Readmission5")) %>% 
    dplyr::select(-comb) %>% 
    dplyr::mutate(Proportions=scales::percent(Proportions))

tab9_f %>% 
  dplyr::left_join(data.frame(events(ms_d_match_surv)$Frequencies) %>% 
    dplyr::filter(to=="total entering") %>% dplyr::select(from,Freq),by="from") %>% 
  dplyr::select(from, to, Frequencies, Freq, Proportions)%>% 
  dplyr::rename("Total"="Freq") %>% 
    knitr::kable(format= "html", format.args= list(decimal.mark= ".", big.mark= ","),
               caption="Table 9. Empirical State Transition Matrix, Recurrent Events Model",
               align= c("c",rep('c', 5)))%>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size= 11)%>% 
  kableExtra::add_footnote("Note. No event describes cases that remained in the state. Percentage depicts the proportion of the state of origin.") %>% 
  kableExtra::scroll_box(width = "100%", height = "350px")
Table 9. Empirical State Transition Matrix, Recurrent Events Model
from to Frequencies Total Proportions
Admission Readmission 6,398 22,912 27.92%
Readmission Readmission2 2,012 6,398 31.45%
Readmission2 Readmission3 646 2,012 32.11%
Readmission3 Readmission4 205 646 31.73%
a Note. No event describes cases that remained in the state. Percentage depicts the proportion of the state of origin.


ms_d_match_surv_res<-
  #El arrival y el número al lado del arrival repreenta el número de la transición
  #mstate::expand.covs(ms_d_match_surv, "arrival", append = TRUE, longnames =F) %>% 
  ms_d_match_surv %>% 
  data.frame() %>%
  dplyr::mutate(tipo_de_plan_res=dplyr::case_when(tipo_de_plan_res_1==1 & trans==1~1,
                                                  tipo_de_plan_res_2==1 & trans==2~1,
                                                  tipo_de_plan_res_3==1 & trans==3~1,
                                                  tipo_de_plan_res_4==1 & trans==4~1,
                                                  T~0)) %>% 
    dplyr::mutate(TD=dplyr::case_when(TD_1==1 & trans==1~1,
                                                  TD_2==1 & trans==2~1,
                                                  TD_3==1 & trans==3~1,
                                                  TD_4==1 & trans==4~1,
                                                  T~0)) %>% 
    dplyr::mutate(TD=dplyr::case_when(TD_1==1 & trans==1~1,
                                                  TD_2==1 & trans==2~1,
                                                  TD_3==1 & trans==3~1,
                                                  TD_4==1 & trans==4~1,
                                                  T~0)) %>% 
    dplyr::mutate(days_treated=dplyr::case_when(!is.na(dias_treat_imp_sin_na_1) & trans==1~dias_treat_imp_sin_na_1,
                                                !is.na(dias_treat_imp_sin_na_2) & trans==2~dias_treat_imp_sin_na_2,
                                                !is.na(dias_treat_imp_sin_na_3) & trans==3~dias_treat_imp_sin_na_3,
                                                !is.na(dias_treat_imp_sin_na_4) & trans==4~dias_treat_imp_sin_na_4,
                                                  T~NA_real_)) 

ms_d_match_surv$tipo_de_plan_res<-ms_d_match_surv_res$tipo_de_plan_res
ms_d_match_surv$TD<-ms_d_match_surv_res$TD
ms_d_match_surv$days_treated<-ms_d_match_surv_res$days_treated


Consideration of the Appropriateness of the proportional hazards assumption

Continuous variables need to be categorized into groups. The plot described is also known as the log(−log(survival)) plot, as the cumulative hazard is equal to the negative logarithm of the survival proportion. This approach requires a subjective assessment (Bradburn, Clark, Love, et al., 2003).

#Bradburn, M., Clark, T., Love, S. et al. Survival Analysis Part III: Multivariate data analysis – choosing a model and assessing its adequacy and fit. Br J Cancer 89, 605–611 (2003). https://doi.org/10.1038/sj.bjc.6601120

plots<- data.frame(title=rep(
  c("Admission to Readmission", "Readmission to Readmission2", "Readmission2 to Readmission3", "Readmission3 to Readmission4" ),1),trans=rep(1:max(trans_matrix,na.rm=T),1))

## SIN COVARIABLES
layout(matrix(1:4, nc = 2, byrow = F))
for(i in c(1:max(trans_matrix,na.rm=T))){
plot(log(survfit(Surv(time,status)~1, data=subset(ms_d_match_surv, trans==plots[i,"trans"] & tipo_de_plan_res_1==0))$time), 
     log(-log(survfit(Surv(time,status)~1, data=subset(ms_d_match_surv, trans==plots[i,"trans"] & tipo_de_plan_res_1==0))$surv)), type="l",
     xlab="log(Days)", ylab="", xaxs="i",yaxs="i",
     las=1,cex.lab=.5, cex.axis=.5)
lines(log(survfit(Surv(time,status)~1, data=subset(ms_d_match_surv, trans==plots[i,"trans"] & tipo_de_plan_res_1==1))$time), 
      log(-log(survfit(Surv(time,status)~1, data=subset(ms_d_match_surv, trans==plots[i,"trans"] & tipo_de_plan_res_1==1))$surv)), lty=2)
legend(7,-4, c("OUT", "RES"), bty="n", lty=c(2,1), cex=.5)
title(main=paste0(plots[i,"title"]), cex.main=.8)
}
Figure 18a. LOG CUMULATIVE HAZARD VS LOG TIME PLOT (w/o covars)

Figure 18a. LOG CUMULATIVE HAZARD VS LOG TIME PLOT (w/o covars)

layout(matrix(1:4, nc = 2, byrow = F))

for(i in c(1:max(trans_matrix,na.rm=T))){
plot(survfit(Surv(time,status)~1, data=subset(ms_d_match_surv, trans==plots[i,"trans"] & tipo_de_plan_res_1==0))$time, 
     -log(survfit(Surv(time,status)~1, data=subset(ms_d_match_surv, trans==plots[i,"trans"] & tipo_de_plan_res_1==0))$surv), type="l",
     xlab="Days", ylab="", xaxs="i",yaxs="i", 
     las=1,cex.lab=.5, cex.axis=.5, col=1)
lines(survfit(Surv(time,status)~1, data=subset(ms_d_match_surv, trans==plots[i,"trans"] & tipo_de_plan_res_1==1))$time, 
      -log(survfit(Surv(time,status)~1, data=subset(ms_d_match_surv, trans==plots[i,"trans"] & tipo_de_plan_res_1==1))$surv), lty=2)
legend(2000,.1, c("OUT", "RES"), bty="n", lty=c(2,1), cex=.5)
title(main=paste0(plots[i,"title"]), cex.main=.8)
}
Figure 18b. CUMULATIVE HAZARD PLOT: -LOG(KM SURVIVAL) (w/o covars)

Figure 18b. CUMULATIVE HAZARD PLOT: -LOG(KM SURVIVAL) (w/o covars)

As seen in both Figures above, the cumulative hazards does not follow a proportional trend in the four transitions.


Decision whether to use Markov or Semi-Markov


#state arrival extended (semi-)Markov to mean that the i → j transition hazard depends on thetime of arrival at state i. 

#Build a Cox proportional hazard model including treatment and time in previous state as covariates

tab_cox_markov<- data.frame()
for (i in c(2:max(trans_matrix,na.rm=T))){
coxph(Surv(Tstart,Tstop,status)~factor(tipo_de_plan_res_1)+Tstart,
                  data=subset(ms_d_match_surv_res, trans==i),method = "breslow") %>% 
    assign(paste0("CoxMarkov",i),.,envir=.GlobalEnv)
  round(exp(coef(get(paste0("CoxMarkov",i)))),2)%>% assign(paste0("HR",i),.,envir=.GlobalEnv)
  round(exp(confint(get(paste0("CoxMarkov",i)))),2)%>% assign(paste0("CI",i),.,envir=.GlobalEnv)
  round(coef(summary(get(paste0("CoxMarkov",i))))[,5],4)%>% assign(paste0("P",i),.,envir=.GlobalEnv)
  data.frame(get(paste0("CI",i))) %>% dplyr::rename("Lower 95% CI"="X2.5..","Upper 95% CI"="X97.5..")%>% assign(paste0("CI",i),.,envir=.GlobalEnv)
  tab_cox_markov_add<- cbind.data.frame(plots[i, "title"],get(paste0("HR",i)),get(paste0("CI",i)),get(paste0("P",i)))
  tab_cox_markov<-rbind.data.frame(tab_cox_markov,tab_cox_markov_add)
}

tab_cox_markov %>% 
  data.table(keep.rownames=T) %>% 
  dplyr::rename("Terms"="rn", "Transition"="plots[i, \"title\"]",
                "HR"="get(paste0(\"HR\", i))","P"="get(paste0(\"P\", i))") %>% 
  dplyr::mutate(Terms=dplyr::case_when(grepl("tipo_de_", Terms)~"Type of Plan (Residential)",
                                    grepl("Tstart",Terms)~"Time in previous state(in days)")) %>% 
  dplyr::mutate(P=ifelse(P<.001,"<.001",sprintf("%1.3f",P))) %>% 
  dplyr::rename("Sig."="P") %>% 
  dplyr::mutate(`95% CIs`=paste0(sprintf("%2.2f",`Lower 95% CI`),", ",sprintf("%2.2f",`Upper 95% CI`))) %>% 
  dplyr::select(-`Lower 95% CI`,-`Upper 95% CI`) %>% 
  dplyr::select(Transition, Terms, HR, `95% CIs`, Sig.) %>% 
      knitr::kable(format= "html", format.args= list(decimal.mark= ".", big.mark= ","),
               caption="Table 10. PH Model incluiding time in previous state & Type of Program as a covariate",
               align= c("c",rep('c', 5)))%>%
  #kableExtra::pack_rows("Three-states", 1, 2) %>% 
  #kableExtra::pack_rows("Four-states", 3, 4) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size= 11)%>% 
  kableExtra::scroll_box(width = "100%", height = "350px")
Table 10. PH Model incluiding time in previous state & Type of Program as a covariate
Transition Terms HR 95% CIs Sig.
Readmission to Readmission2 Type of Plan (Residential) 1.03 0.94, 1.13 0.538
Readmission to Readmission2 Time in previous state(in days) 1.00 1.00, 1.00 <.001
Readmission2 to Readmission3 Type of Plan (Residential) 1.00 0.85, 1.17 0.990
Readmission2 to Readmission3 Time in previous state(in days) 1.00 1.00, 1.00 <.001
Readmission3 to Readmission4 Type of Plan (Residential) 1.08 0.81, 1.44 0.610
Readmission3 to Readmission4 Time in previous state(in days) 1.00 1.00, 1.00 0.047
#a variable appears on both the left and right sides of the formula
#this warning should be normal, since we are dealing with time to arrival at a determined state.
#https://github.com/andrewtitman/MarkovTest/blob/master/cox_markov_test.R

trans_matrix_etm <- matrix(c(
F,T,F,F,F,
F,F,T,F,F,
F,F,F,T,F,
F,F,F,F,T,
F,F,F,F,F
), nrow=5, ncol=5,
byrow=TRUE,
dimnames=list(from=1:5,to=1:5))

etm_ms_d_match_surv<-
mstate::msdata2etm(ms_d_match_surv_res, "id", c("tipo_de_plan_res","tipo_de_plan_res_1", "TD_1", "TD_2", "TD_3"))

tmat2Q <- function(tmat)
{
  K <- nrow(tmat)
  P <- tmat
  P[!is.na(P)] <- 1
  P[is.na(P)] <- 0
  diag(P) <- 1
  k <- 1
  Pk <- P
  diag(Pk) <- 0
  Pkprev <- Pk
  Q <- Pk
  for (k in 2:K) {
    Pk <- Pk %*% P
    Pk[Pk > 1] <- 1
    Q <- Q + k * (Pk - Pkprev)
    Pkprev <- Pk
  }
  Q
}

cox_markov_test <- function(data, formula=NULL, 
                            tfrom, 
                            tto, 
                            trans, 
                            grid, 
                            B=1000, 
                            fn = list(function(x) mean(abs(x),na.rm=TRUE)), 
                            fn2 = list(function(x) mean(x,na.rm=TRUE)),
                            dist="poisson") {
  
  #data: dataset in etm format: "entry", "exit", "from", "to", "id". Should also contain the relevant covariates: no factors allowed
  #formula: right-hand side of the formula : If NULL will fit with no covariates (formula="1" will also work), offset terms can also be specified.
  #tfrom: from state in transition of interest
  #tto: to state in transition of interest
  #trans: transition matrix of the underlying model.
  #grid: grid of times s to compute the statistic
  #B: number of wild bootstrap samples to perform
  ###################################
  #fn: a list of summary functions : to be applied to the individual zbar traces. (or list of lists)
  ###################################
  #fn2: a list of summary functions : to be applied to the overall chi-squared trace.
  #dist: Form of wild bootstrap random weights (defaults as centred poisson, alternative is normal(0,1))
  
  
  qualset <- c(tfrom, which(tmat2Q(trans)[,tfrom]>0))
  qualset <- sort(unique(qualset))
  
  #########################
  if (!is.list(fn)) {
    fn<-list(fn) 
  }
  if (is.list(fn) & is.function(fn[[1]])) {
    tempfn <- list()
    for (i in 1:length(qualset)) tempfn[[i]]<-fn
    fn <- tempfn
  }
  if (!is.list(fn2)) fn2<-list(fn2) #Coerce to be list if a single function is provided
  #Establish the relevant patients who ever enter tfrom:
  relpat <- sort(unique(data$id[data$from==tfrom]))
  rdata <- data[data$from==tfrom,] #Only need time periods in the relevant state...
  rdata$status <- 1*(rdata$to==tto)
  if (!is.null(formula)) {
    form <- as.formula(paste("Surv(entry,exit,status)~",formula,sep=""))
    progfit <- coxph(form, data= rdata)
    if (length(progfit$coefficients)>0) {
      Zmat <- as.matrix(rdata[,match(names(progfit$coefficients),names(rdata))])
      Ncov <- dim(Zmat)[2]
    }else{
      Ncov <- 0
    }
    if (!is.null(progfit$offset)) {
      offset <- progfit$offset
    }else{
      offset <- rep(0,dim(rdata)[1])
    }
  }else{
    Ncov <- 0
    offset <- rep(0,dim(rdata)[1])
    progfit <- NULL
  }
  
  progdat <- rdata[,match(c("id","entry","exit","status"),names(rdata))]
  names(progdat) <- c("id","T0","T1","D")
  
  nobs_grid <- sapply(grid,function(x) sum(progdat$D[progdat$T1 > x])) 
  
  #Have the extra dimension of indexes
  index_gM <- array(0,c(length(relpat),length(grid),length(qualset)))
  for (indx in 1:length(qualset)) {
    qualstate <- qualset[indx]
    index_g <- sapply(grid,function(y) sapply(relpat,function(x) which(data$entry < y & data$exit >= y & data$id==x)))
    index_g <- array(1*(data$from[sapply(index_g,function(y) ifelse(length(y)>0,y,dim(data)[1]+1))]==qualstate),c(length(relpat),length(grid)))
    index_g[is.na(index_g)]<-0
    index_gM[,,indx] <- index_g
  }
  
  #Need a separate Z3mat for each group as well...
  Z3mat <- index_gM[match(progdat$id,relpat),,,drop=FALSE]
  N1 <- dim(progdat)[1]
  
  if (Ncov >0 ) {
    LP <- c(Zmat%*%progfit$coefficients) + offset
  }else{
    LP <- rep(0,N1) + offset
  }
  S0 <- sapply(1:N1,function(x) sum(exp(LP)*(progdat$T0 < progdat$T1[x] & progdat$T1 >= progdat$T1[x])))
  
  incr <- progdat$D/S0
  cumhaz <- approxfun(c(0,sort(unique(progdat$T1)),Inf),c(0,cumsum(tapply(incr,progdat$T1,sum)),sum(incr)),method="constant")
  resid_mat <- sapply(grid, function(x) progdat$D*(progdat$T1 > x) - exp(LP)*(cumhaz(pmax(x,progdat$T1)) - cumhaz(pmax(x,progdat$T0))))
  
  #Have a separate trace for each qualifying state...
  obs_trace <- array(0,c(length(grid),length(qualset)))
  for (indx in 1:(length(qualset))) {
    obs_trace[,indx] <- sapply(1:length(grid), function(k) sum(resid_mat[,k]*Z3mat[,k,indx]*(progdat$T1 > grid[k])))
  }
  
  
  nqstate <- length(qualset)
  
  if (Ncov >0) Ifish <- progfit$var
  
  
  N1 <- dim(progdat)[1]
  if (Ncov >0) Zbar0 <- array(0,c(N1,Ncov))
  
  Zbar <- array(0,c(N1,length(grid),nqstate))
  for (i in 1:N1) {
    x <- i
    if (Ncov >0) {
      for (j in 1:Ncov) {
        Zbar0[i,j] <- sum(Zmat[,j] * exp(LP) * (progdat$T0 < progdat$T1[x] & progdat$T1 >= progdat$T1[x]))/sum(exp(LP) * (progdat$T0 < progdat$T1[x] & progdat$T1 >= progdat$T1[x]))
      }
    }
    for (j in 1:length(grid)) {
      for (k in 1:nqstate) Zbar[i,j,k] <- sum(Z3mat[,j,k] * exp(LP) * (progdat$T0 < progdat$T1[x] & progdat$T1 >= progdat$T1[x]))/sum(exp(LP) * (progdat$T0 < progdat$T1[x] & progdat$T1 >= progdat$T1[x]))
    }
  }
  
  NAe <- incr
  
  
  
  if (Ncov > 0) {
    Hmat <- array(0,c(length(grid),Ncov,nqstate))
    for (j in 1:Ncov) {
      for (k in 1:nqstate)  Hmat[,j,k] <- sapply(1:length(grid),function(y) sum(sapply(1:N1,function(x) sum(exp(LP[x]) *  ((Zmat[x,j] -Zbar0[,j])*(Z3mat[x,y,k] - Zbar[,y,k]))* NAe  * (progdat$T1[x] > grid[y]) * (progdat$T1 > progdat$T0[x] & progdat$T1 <= progdat$T1[x])))))
    }
  }
  
  
  if (Ncov >0) {
    multiplier <- array(0,dim(Hmat))
    for (k in 1:nqstate) multiplier[,,k] <- Hmat[,,k]%*%Ifish
    est_cov <- array(0,c(length(grid),nqstate,nqstate))
    for (indx1 in 1:nqstate) {
      for (indx2 in (indx1):nqstate) {
        est_var <- sapply(1:length(grid), function(k) sum(sapply(1:N1,function(v) sum( ((Z3mat[v,k,indx1] - Zbar[,k,indx1])*(progdat$T1 > grid[k]) - c(multiplier[k,,indx1,drop=FALSE]%*%t(Zmat[v,] - Zbar0)))*((Z3mat[v,k,indx2] - Zbar[,k,indx1])*(progdat$T1 > grid[k]) - c(multiplier[k,,indx2,drop=FALSE]%*%t(Zmat[v,] - Zbar0)))*exp(LP[v])*(progdat$T0[v] < progdat$T1 & progdat$T1[v] >= progdat$T1) * NAe))))
        est_cov[,indx1,indx2] <- est_cov[,indx2,indx1] <- est_var 
      }
    }
    
  }else{
    est_cov <- array(0,c(length(grid),nqstate,nqstate))
    for (indx1 in 1:nqstate) {
      for (indx2 in (indx1):nqstate) {
        est_var <- sapply(1:length(grid), function(k) sum(sapply(1:N1,function(v) sum((Z3mat[v,k,indx1] - Zbar[,k,indx1])*(Z3mat[v,k,indx2] - Zbar[,k,indx2])*exp(LP[v])*(progdat$T1 > grid[k] & progdat$T0[v] < progdat$T1 & progdat$T1[v] >= progdat$T1) * NAe))))
        est_cov[,indx1,indx2] <- est_cov[,indx2,indx1] <- est_var 
      }
    }
  }
  
  #First obtain the individually normalized traces...
  est_var <- obs_norm_trace <- array(0,c(length(grid),nqstate))
  for (k in 1:nqstate) {
    est_var[,k] <- est_cov[cbind(1:length(grid),k,k)]
    obs_norm_trace[,k] <- obs_trace[,k]/sqrt(est_var[,k] + 1*(est_var[,k]==0)) #This should be the same as before...
  }
  #Find singular matrices
  obs_chisq_trace <- rep(0,length(grid))
  for (k in 1:length(grid)) {
    sol <- tryCatch(solve(est_cov[k,-1,-1]),error = function(e) return(diag(0,nqstate-1)))
    obs_chisq_trace[k] <- (obs_trace[k,-1])%*%sol%*%(obs_trace[k,-1]) #Do something about singular matrices...
  }
  
  ##############
  
  n_wb_trace <- wb_trace0 <- wb_trace <- array(0,c(B,length(grid),nqstate))
  nch_wb_trace <- array(0,c(B,length(grid)))
  for (wb in 1:B) {
    if (dist=="poisson") {
      G <- rpois(dim(progdat)[1],1) - 1
    }else{
      G <- rnorm(dim(progdat)[1],0,1)
    }
    trace0 <- array(0,c(length(grid),nqstate))
    for (k in 1:nqstate) {
      trace0[,k] <- apply(sapply(1:length(grid), function(x) progdat$D * (Z3mat[,x,k] - Zbar[,x,k]) *(progdat$T1 > grid[x])*G  ),2,sum)
      if (Ncov >0) {
        Imul <- sapply(1:Ncov, function(x) sum(progdat$D * (Zmat[,x] - Zbar0[,x]) * G))
        trace1 <- (Hmat[,,k]%*%Ifish%*%Imul)[,1]
      }else{
        trace1 <-0
      }
      wb_trace[wb,,k] <- trace0[,k] - trace1 
      n_wb_trace[wb,,k] <- wb_trace[wb,,k]/sqrt(est_var[,k] + 1*(est_var[,k] ==0 ))
      for (w in 1:length(grid)){
        sol <- tryCatch(solve(est_cov[w,-1,-1]),error = function(e) return(diag(0,nqstate-1)))
        nch_wb_trace[wb,w] <- (wb_trace[wb,w,-1])%*%sol%*%(wb_trace[wb,w,-1]) #Do something about singular matrices...
      }
      
    }
  }
  
  #Need to have one of these per nqstate
  NS <- length(fn[[1]])
  
  orig_stat <- array(sapply(1:nqstate,function(y) sapply(fn[[y]],function(g) g(obs_norm_trace[,y]))),c(NS,nqstate))
  orig_ch_stat <- sapply(fn2,function(g) g(obs_chisq_trace))
  
  p_stat_wb <- array(0,c(NS,nqstate))
  wb_stat <- array(0,c(B,NS,nqstate))
  for (k in 1:nqstate) {
    wb_stat[,,k] <- array(t(apply(n_wb_trace[,,k,drop=FALSE],1,function(x) sapply(fn[[k]],function(g) g(x)))),c(B,NS))
    p_stat_wb[,k] <- sapply(1:NS, function(x) mean(wb_stat[,x,k] > orig_stat[x,k]))
  }
  est_quant <- array(0,c(2,length(grid),nqstate))
  for (k in 1:nqstate) est_quant[,,k] <- apply(n_wb_trace[,,k,drop=FALSE],2,quantile,c(0.025,0.975),na.rm=TRUE)
  NS2 <- length(fn2)
  p_ch_stat_wb <- rep(0,NS2)
  wb_ch_stat <- array(t(apply(nch_wb_trace,1,function(x) sapply(fn2,function(g) g(x)))),c(B,NS2))
  p_ch_stat_wb <- sapply(1:NS2, function(x) mean(wb_ch_stat[,x] > orig_ch_stat[x]))
  
  #orig_stat: summary statistic for each of the starting states
  #orig_ch_stat: overall chi-squared summary statistic
  #p_stat_wb: p-values corresponding to each of the summary statistics for each starting state
  #p_ch_stat_wb: p-values for overall chi=squared summary statistics
  #b_stat_wb: bootstrap summary statistics for each of the starting states
  #zbar: individual traces for each of the starting states
  #nobs_grid: the number of events after time s for each s in the grid
  #Nsub: number of patients who are ever at risk of the transition of interest
  #est_quant: pointwise 2.5% and 97.5% quantile limits for each of the traces
  #obs_chisq_trace: trace of the chi-squared statistic.
  #nch_wb_trace: individual values of the chi-squared statistic trace for the wild bootstrap samples
  #n_wb_trace: individual values of the log-rank z statistic traces for the wild bootstrap samples
  #est_cov: estimated covariance matrix between the log-rank statistics at each grid point
  #qualset: qualifying states corresponding to the components of the above traces.
  #coxfit: fitted coxph object
  return(list(orig_stat = orig_stat ,orig_ch_stat = orig_ch_stat, p_stat_wb = p_stat_wb , p_ch_stat_wb = p_ch_stat_wb, b_stat_wb = wb_stat, zbar = obs_norm_trace, nobs_grid = nobs_grid, Nsub=length(relpat),
              est_quant=est_quant,obs_chisq_trace=obs_chisq_trace,nch_wb_trace=nch_wb_trace,n_wb_trace=n_wb_trace,est_cov=est_cov,qualset=qualset,coxfit=progfit))
} 


#Create a function that implements the proposed weighting for the chi-squared trace
weights_multiple <- function(data,grid,from,to,min_time=0) {
  numbers <- sapply(grid,function(x) table(factor(data$from)[(data$entry <= x & data$exit >x)]))
  subevent <- sapply(grid,function(x) sum(data$from==from & data$to==to & data$exit >x))
  tnumbers <- apply(numbers,2,sum)
  weights <- sapply(1:dim(numbers)[1], function(x) subevent*numbers[x,]*(tnumbers - numbers[x,])/tnumbers^2)
  weights[is.nan(weights)]<-0
  weight <- apply(weights,1,max)
  weight*diff(c(min_time,grid))
}

weights_matrix <- function(data,grid,from,to,min_time=0,other_weights=NULL) {
  numbers <- sapply(grid,function(x) table(factor(data$from)[(data$entry <= x & data$exit >x)]))
  subevent <- sapply(grid,function(x) sum(data$from==from & data$to==to & data$exit >x))
  tnumbers <- apply(numbers,2,sum)
  weights <- sapply(1:dim(numbers)[1], function(x) sqrt(subevent*numbers[x,]*(tnumbers - numbers[x,]))/tnumbers)
  weights[is.nan(weights)]<-0
  fn_list <- list()
  for (i in 1:dim(numbers)[1]) {
    #Take into account the distance between grids
    val <- weights[,i]*diff(c(min_time,grid))
    fn_list[[i]] <- list(fn=function(x) weighted.mean(abs(x),w=val,na.rm=TRUE))
    if (!is.null(other_weights)) {
      nother <- length(other_weights)
      fn_list[[i]][2:(nother+1)] <- other_weights
    }
  }
  #Store the weights as an attribute.
  attr(fn_list,"weights")<-weights
  fn_list
}

#_#_#_#__#_#_#_#_#_#__###### markov test ##### _#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

#Time grid
tseq <- seq(1,1827,by=10) 

#Three approaches to testing are considered; i) A simple method based on including 
#time of entry into the state as a covariate in a Cox model for each transition 
#intensity ii) Use of the stratified version of the Commenges-Andersen test 2 
#for a univariate frailty, and iii) A novel class of tests based on families of 
#log-rank statistics, where patients are grouped by their state occupancy at landmark times.

start_time <- Sys.time()

cox_markov_test_prueba12<-
  cox_markov_test(etm_ms_d_match_surv, formula="tipo_de_plan_res_1 + TD_1", tfrom=1 , tto=2, grid=tseq, trans=trans_matrix_etm, B=1000)

cox_markov_test_prueba23<-
  cox_markov_test(etm_ms_d_match_surv, formula="tipo_de_plan_res_1 + TD_1", tfrom=2 , tto=3, grid=tseq, trans=trans_matrix_etm, B=1000)

cox_markov_test_prueba34<-
  cox_markov_test(etm_ms_d_match_surv, formula="tipo_de_plan_res_1 + TD_1", tfrom=3 , tto=4, grid=tseq, trans=trans_matrix_etm, B=1000)

cox_markov_test_prueba45<-
  cox_markov_test(etm_ms_d_match_surv, formula="tipo_de_plan_res_1 + TD_1", tfrom=4 , tto=5, grid=tseq, trans=trans_matrix_etm, B=1000)


cox_markov_test_prueba12d<-
  cox_markov_test(etm_ms_d_match_surv, formula="tipo_de_plan_res + TD_1", tfrom=1 , tto=2, grid=tseq, trans=trans_matrix_etm, B=1000)

cox_markov_test_prueba23d<-
  cox_markov_test(etm_ms_d_match_surv, formula="tipo_de_plan_res + TD_1 + TD_2", tfrom=2 , tto=3, grid=tseq, trans=trans_matrix_etm, B=1000)

cox_markov_test_prueba34d<-
  cox_markov_test(etm_ms_d_match_surv, formula="tipo_de_plan_res + TD_1+ TD_2+ TD_3", tfrom=3 , tto=4, grid=tseq, trans=trans_matrix_etm, B=1000)

cox_markov_test_prueba45d<-
  cox_markov_test(etm_ms_d_match_surv, formula="tipo_de_plan_res + TD_1+ TD_2+ TD_3+ TD4", tfrom=4, tto=5, grid=tseq, trans=trans_matrix_etm, B=1000)

end_time <- Sys.time()

print("Time taken in process")
end_time - start_time
#It should be noted in this context that even when the Markov assumption is not satisfied, theAJ estimator
#may have smaller mean squared error than the LMAJ estimator, as shown in simulation studies (Putter and
#Spitoni, 2018). It is the familiar bias-variance trade-off, where for smaller sample size variance tends to
#dominate—in favor of AJ—and for larger sample size bias tends to dominate—in favor of LMAJ. Using
##this work as a pre-test fits in nicely within this framework; larger sample size will have more power to
#detect violations of the Markov assumption, suggesting to use robust methods. More work is needed to
#study how this works out in practice.

require(lattice)

plot.MarkovTest <- function(x, y, what=c("states", "overall"), idx=NULL, quantiles=TRUE, qsup, states,
                            xlab, ylab, main, ...)
{
  what <- match.arg(what)
  B <- dim(x$n_wb_trace)[1]
  ny <- length(y)
  if (missing(xlab)) xlab <- "Time"
  if (missing(ylab)) ylab <- "Test statistic"
  if (missing(main)) main <- ""
  if (what=="states") {
    # dfr <- x$zbar
    qualset <- x$qualset
    J <- length(qualset)
    dfr <- data.frame(time=rep(tseq, J), zbar=as.numeric(x$zbar), qualstate=rep(qualset, each=ny), ct=0)
    lwd <- 2
    lty <- 1
    col <- 1
    if (quantiles) {
      dfrl1 <- data.frame(time=rep(tseq, J), zbar=as.numeric(x$est_quant[1, , ]), qualstate=rep(qualset, each=ny), ct=1)
      dfru1 <- data.frame(time=rep(tseq, J), zbar=as.numeric(x$est_quant[2, , ]), qualstate=rep(qualset, each=ny), ct=3)
      dfr <- rbind(dfr, dfrl1, dfru1)
      lwd <- c(lwd, 2, 2)
      lty <- c(lty, 3, 3)
      col <- c(col, 1, 1)
    }
    if (!missing(qsup)) {
      if (qsup %in% 1:dim(x$b_stat_wb)[2]) {
        q95 <- apply(x$b_stat_wb[, qsup, ], 2, quantile, 0.95)
        print(q95)
        dfrl2 <- data.frame(time=rep(tseq, J), zbar=rep(-q95, each=ny), qualstate=rep(qualset, each=ny), ct=2)
        dfru2 <- data.frame(time=rep(tseq, J), zbar=rep(q95, each=ny), qualstate=rep(qualset, each=ny), ct=4)
        dfr <- rbind(dfr, dfrl2, dfru2)
        lwd <- c(lwd, 2, 2)
        lty <- c(lty, 3, 3)
        col <- c(col, 1, 1)
      }
    }
    if (!is.null(idx)) {
      idx <- intersect(1:B, idx)
      nB <- length(idx)
      if (nB > 0) {
        dfrb <- data.frame(time=rep(rep(y, J), each=nB),
                           zbar=as.numeric(x$n_wb_trace[idx, , ]),
                           qualstate=rep(qualset, each=ny*nB),
                           ct=rep(-idx, ny*J))
        dfr <- rbind(dfrb, dfr)
        lwd <- c(rep(0.5, nB), lwd)
        lty <- c(rep(1, nB), lty)
        col <- c(rep(8, nB), col)
      }
    }
    # print(dim(dfr))
    if (missing(states)) dfr$qualstate <- factor(dfr$qualstate)
    else dfr$qualstate <- factor(dfr$qualstate, levels=qualset, labels=states[qualset])
    xyplot(zbar ~ time | qualstate, data=dfr, groups=ct, lwd=lwd, type="l", col=col, lty=lty,
           xlab=xlab, ylab=ylab, main=main)
  }
  else if (what=="overall") {
    dfr <- data.frame(time=y, zbar=as.numeric(x$obs_chisq_trace), ct=0)
    lwd <- 2
    lty <- 1
    col <- 1
    if (quantiles) {
      
      dfru <- data.frame(time=y, zbar=apply(x$nch_wb_trace, 2, quantile, probs=0.95), ct=-1)
      dfr <- rbind(dfr, dfru)
      lwd <- c(2, lwd)
      lty <- c(3, lty)
      col <- c(1, col)
    }
    if (!is.null(idx)) {
      idx <- intersect(1:B, idx)
      nB <- length(idx)
      if (nB > 0) {
        dfrb <- data.frame(time=rep(y, each=nB),
                           zbar=as.numeric(x$nch_wb_trace[idx, ]),
                           ct=rep(idx, ny))
        dfr <- rbind(dfrb, dfr)
        lwd <- c(lwd, rep(0.5, nB))
        lty <- c(lty, rep(1, nB))
        col <- c(col, rep(8, nB))
      }
    }
    # print(dim(dfr))
    # print(dfr)
    xyplot(zbar ~ time, data=dfr, groups=ct, lwd=lwd, type="l", col=col, lty=lty,
           xlab=xlab, ylab=ylab, main=main)
  }
}
#$    tipo_de_plan_res + TD_1 - "tipo_de_plan_res_1 + TD_1"


plot.MarkovTest(cox_markov_test_prueba12, tseq, what="states",idx=1:50, 
                states=colnames(trans_matrix),
                xlab="Days since arrival", ylab="Log-rank test statistic", main="Adm-> Readm")
## Error in plot.MarkovTest(cox_markov_test_prueba12, tseq, what = "states", : objeto 'cox_markov_test_prueba12' no encontrado
plot.MarkovTest(cox_markov_test_prueba23, tseq, what="states",idx=1:50, 
                states=colnames(trans_matrix),
                xlab="Days since arrival", ylab="Log-rank test statistic", main="Readm-> Readm2")
## Error in plot.MarkovTest(cox_markov_test_prueba23, tseq, what = "states", : objeto 'cox_markov_test_prueba23' no encontrado
plot.MarkovTest(cox_markov_test_prueba34, tseq, what="states",idx=1:50, 
                states=colnames(trans_matrix),
                xlab="Days since arrival", ylab="Log-rank test statistic", main="Readm2-> Readm3")
## Error in plot.MarkovTest(cox_markov_test_prueba34, tseq, what = "states", : objeto 'cox_markov_test_prueba34' no encontrado
plot.MarkovTest(cox_markov_test_prueba45, tseq, what="states",idx=1:50, 
                states=colnames(trans_matrix),
                xlab="Days since arrival", ylab="Log-rank test statistic", main="Readm3-> Readm4")
## Error in plot.MarkovTest(cox_markov_test_prueba45, tseq, what = "states", : objeto 'cox_markov_test_prueba45' no encontrado
#$    tipo_de_plan_res + TD_1 - "tipo_de_plan_res_1 + TD_1"

plot.MarkovTest(cox_markov_test_prueba12d, tseq, what="states",idx=1:50, 
                states=colnames(trans_matrix),
                xlab="Days since arrival", ylab="Log-rank test statistic", main="Adm-> Readm")
## Error in plot.MarkovTest(cox_markov_test_prueba12d, tseq, what = "states", : objeto 'cox_markov_test_prueba12d' no encontrado
plot.MarkovTest(cox_markov_test_prueba23d, tseq, what="states",idx=1:50, 
                states=colnames(trans_matrix),
                xlab="Days since arrival", ylab="Log-rank test statistic", main="Readm-> Readm2")
## Error in plot.MarkovTest(cox_markov_test_prueba23d, tseq, what = "states", : objeto 'cox_markov_test_prueba23d' no encontrado
plot.MarkovTest(cox_markov_test_prueba34d, tseq, what="states",idx=1:50, 
                states=colnames(trans_matrix),
                xlab="Days since arrival", ylab="Log-rank test statistic", main="Readm2-> Readm3")
## Error in plot.MarkovTest(cox_markov_test_prueba34d, tseq, what = "states", : objeto 'cox_markov_test_prueba34d' no encontrado
plot.MarkovTest(cox_markov_test_prueba45d, tseq, what="states",idx=1:50, 
                states=colnames(trans_matrix),
                xlab="Days since arrival", ylab="Log-rank test statistic", main="Readm3-> Readm4")
## Error in plot.MarkovTest(cox_markov_test_prueba45d, tseq, what = "states", : objeto 'cox_markov_test_prueba45d' no encontrado

The model considered the transition from intermediate states to our absorbing state (being readmitted at the fourth time) is explained by the time spent in the previous health state. This covariate (time in the previous state) was shown to be statistically significant (p<.001); results indicated a longer duration spent in the first treatment is associated with increased risk of readmission. Therefore, a semi-Markov (called a Markov renewal model) or clock reset approach should be undertaken for both models.


Session Info

path<-rstudioapi::getSourceEditorContext()$path

Sys.getenv("R_LIBS_USER")
rstudioapi::getSourceEditorContext()

if (grepl("CISS Fondecyt",path)==T){
    save.image("C:/Users/CISS Fondecyt/Mi unidad/Alvacast/SISTRAT 2019 (github)/SUD_CL/mult_state_ago.RData")
  } else if (grepl("andre",path)==T){
    save.image("C:/Users/andre/Desktop/SUD_CL/mult_state_ago.RData")
  } else if (grepl("E:",path)==T){
    save.image("E:/Mi unidad/Alvacast/SISTRAT 2019 (github)/mult_state_ago.RData")
  } else {
    save.image("G:/Mi unidad/Alvacast/SISTRAT 2019 (github)/mult_state_ago.RData")
  }

sessionInfo()